From 5739881e2ca70811fd299978262e6accbad38650 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 21 Feb 2019 21:23:42 +0100 Subject: [PATCH 001/146] Implement a lowering for the intrinsics generated by 'LateLowerGCFrame' --- src/compiler/optim.jl | 106 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index b9ddf32a..305df00d 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -357,6 +357,112 @@ function lower_gc_frame!(fun::LLVM.Function) return changed end +# Visits all calls to a particular intrinsic in a given LLVM module. +function visit_intrinsic(visit_call::Function, name::AbstractString, mod::LLVM.Module) + if haskey(functions(mod), name) + func = functions(mod)[name] + + for use in uses(func) + call = user(use)::LLVM.CallInst + visit_call(call) + end + end +end + +# Lowers the GC intrinsics produce by the LateLowerGCFrame pass. These +# intrinsics are the last point at which we can intervene in the pipeline +# before the passes that deal with them become CPU-specific. +function lower_final_gc_intrinsics!(mod::LLVM.Module) + ctx = global_ctx::CompilerContext + changed = false + + # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates + # store for an object, including headroom, but does not set the object's + # tag. + visit_intrinsic("julia.gc_alloc_bytes", mod) do call + # Decode the call. + ops = collect(operands(call)) + sz = ops[2] + + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. 
+ let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = call!(builder, Runtime.get(:gc_pool_alloc), [ConstantInt(Int32(headroom) + sz, JuliaContext())]) + bumped_ptr = gep!(builder, ptr, [ConstantInt(1, JuliaContext())]) + replace_uses!(call, bumped_ptr) + dispose(builder) + end + + changed = true + end + + # Next up: 'julia.new_gc_frame'. This intrinsic allocates a new GC frame. + # We'll lower it as an alloca and hope SSA construction and DCE passes + # get rid of the alloca. This is a reasonable thing to hope for because + # all intrinsics that may cause the GC frame to escape will be replaced by + # nops. + visit_intrinsic("julia.new_gc_frame", mod) do call + new_gc_frame = functions(mod)["julia.new_gc_frame"] + + # Decode the call. + ops = collect(operands(call)) + sz = ops[1] + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = array_alloca!(builder, eltype(return_type(new_gc_frame)), [sz]) + replace_uses!(call, ptr) + dispose(builder) + end + + changed = true + end + + # The 'julia.get_gc_frame_slot' is closely related to the previous + # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of + # a slot in the GC frame. We can simply turn this intrinsic into a GEP. + visit_intrinsic("julia.get_gc_frame_slot", mod) do call + # Decode the call. + ops = collect(operands(call)) + frame = ops[1] + offset = ops[2] + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = gep!(builder, frame, [offset]) + replace_uses!(call, ptr) + dispose(builder) + end + + changed = true + end + + # The 'julia.push_gc_frame' registers a GC frame with the GC. We + # don't have a GC, so we can just delete calls to this intrinsic! 
+ visit_intrinsic("julia.push_gc_frame", mod) do call + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so + # we can just delete calls to this intrinsic, too. + visit_intrinsic("julia.pop_gc_frame", mod) do call + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + + return changed +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. # # this assumes and checks that the TLS is unused, which should be the case for most GPU code From 61b9f94529acee4a091b75139f41b998ada9d9fd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 22 Feb 2019 11:47:09 +0100 Subject: [PATCH 002/146] Also lower 'julia.queue_gc_root' --- src/compiler/optim.jl | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 305df00d..91069ba3 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -358,7 +358,7 @@ function lower_gc_frame!(fun::LLVM.Function) end # Visits all calls to a particular intrinsic in a given LLVM module. -function visit_intrinsic(visit_call::Function, name::AbstractString, mod::LLVM.Module) +function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Module) if haskey(functions(mod), name) func = functions(mod)[name] @@ -369,6 +369,17 @@ function visit_intrinsic(visit_call::Function, name::AbstractString, mod::LLVM.M end end +# Deletes all calls to a particular intrinsic in a given LLVM module. +# Returns a Boolean that tells if any calls were actually deleted. +function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(name, mod) do call + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + return changed +end + # Lowers the GC intrinsics produce by the LateLowerGCFrame pass. 
These # intrinsics are the last point at which we can intervene in the pipeline # before the passes that deal with them become CPU-specific. @@ -379,7 +390,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates # store for an object, including headroom, but does not set the object's # tag. - visit_intrinsic("julia.gc_alloc_bytes", mod) do call + visit_calls_to("julia.gc_alloc_bytes", mod) do call # Decode the call. ops = collect(operands(call)) sz = ops[2] @@ -406,7 +417,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # get rid of the alloca. This is a reasonable thing to hope for because # all intrinsics that may cause the GC frame to escape will be replaced by # nops. - visit_intrinsic("julia.new_gc_frame", mod) do call + visit_calls_to("julia.new_gc_frame", mod) do call new_gc_frame = functions(mod)["julia.new_gc_frame"] # Decode the call. @@ -428,7 +439,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # The 'julia.get_gc_frame_slot' is closely related to the previous # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of # a slot in the GC frame. We can simply turn this intrinsic into a GEP. - visit_intrinsic("julia.get_gc_frame_slot", mod) do call + visit_calls_to("julia.get_gc_frame_slot", mod) do call # Decode the call. ops = collect(operands(call)) frame = ops[1] @@ -448,17 +459,14 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # The 'julia.push_gc_frame' registers a GC frame with the GC. We # don't have a GC, so we can just delete calls to this intrinsic! - visit_intrinsic("julia.push_gc_frame", mod) do call - unsafe_delete!(LLVM.parent(call), call) - changed = true - end + changed |= delete_calls_to!("julia.push_gc_frame", mod) # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so # we can just delete calls to this intrinsic, too. 
- visit_intrinsic("julia.pop_gc_frame", mod) do call - unsafe_delete!(LLVM.parent(call), call) - changed = true - end + changed |= delete_calls_to!("julia.pop_gc_frame", mod) + + # Ditto for 'julia.queue_gc_root'. + changed |= delete_calls_to!("julia.queue_gc_root", mod) return changed end From a921f3a139da947c2f60e58e0444278e91facf00 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 22 Feb 2019 12:55:40 +0100 Subject: [PATCH 003/146] Fix correctness bugs in the new GC lowering pass --- src/compiler/optim.jl | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 91069ba3..647e83c4 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -364,7 +364,7 @@ function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Mo for use in uses(func) call = user(use)::LLVM.CallInst - visit_call(call) + visit_call(call, func) end end end @@ -373,7 +373,7 @@ end # Returns a Boolean that tells if any calls were actually deleted. function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool changed = false - visit_calls_to(name, mod) do call + visit_calls_to(name, mod) do call, _ unsafe_delete!(LLVM.parent(call), call) changed = true end @@ -390,10 +390,14 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates # store for an object, including headroom, but does not set the object's # tag. - visit_calls_to("julia.gc_alloc_bytes", mod) do call + visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes + gc_alloc_bytes_ft = eltype(llvmtype(gc_alloc_bytes))::LLVM.FunctionType + T_ret = return_type(gc_alloc_bytes_ft)::LLVM.PointerType + T_bitcast = LLVM.PointerType(T_ret, LLVM.addrspace(T_ret)) + # Decode the call. ops = collect(operands(call)) - sz = ops[2] + size = ops[2] # We need to reserve a single pointer of headroom for the tag. 
# (LateLowerGCFrame depends on us doing that.) @@ -403,9 +407,12 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) - ptr = call!(builder, Runtime.get(:gc_pool_alloc), [ConstantInt(Int32(headroom) + sz, JuliaContext())]) - bumped_ptr = gep!(builder, ptr, [ConstantInt(1, JuliaContext())]) + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) + cast_ptr = bitcast!(builder, ptr, T_bitcast) + bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) replace_uses!(call, bumped_ptr) + unsafe_delete!(LLVM.parent(call), call) dispose(builder) end @@ -417,19 +424,22 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # get rid of the alloca. This is a reasonable thing to hope for because # all intrinsics that may cause the GC frame to escape will be replaced by # nops. - visit_calls_to("julia.new_gc_frame", mod) do call - new_gc_frame = functions(mod)["julia.new_gc_frame"] + visit_calls_to("julia.new_gc_frame", mod) do call, new_gc_frame + new_gc_frame_ft = eltype(llvmtype(new_gc_frame))::LLVM.FunctionType + T_ret = return_type(new_gc_frame_ft)::LLVM.PointerType + T_alloca = eltype(T_ret) # Decode the call. ops = collect(operands(call)) - sz = ops[1] + size = ops[1] # Call the allocation function and bump the resulting pointer # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) - ptr = array_alloca!(builder, eltype(return_type(new_gc_frame)), [sz]) + ptr = array_alloca!(builder, T_alloca, size) replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) dispose(builder) end @@ -439,7 +449,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) # The 'julia.get_gc_frame_slot' is closely related to the previous # intrinisc. 
Specifically, 'julia.get_gc_frame_slot' gets the address of # a slot in the GC frame. We can simply turn this intrinsic into a GEP. - visit_calls_to("julia.get_gc_frame_slot", mod) do call + visit_calls_to("julia.get_gc_frame_slot", mod) do call, _ # Decode the call. ops = collect(operands(call)) frame = ops[1] @@ -451,6 +461,7 @@ function lower_final_gc_intrinsics!(mod::LLVM.Module) position!(builder, call) ptr = gep!(builder, frame, [offset]) replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) dispose(builder) end From 80af54b760c427cf716adad65aefb22f79bd194a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 27 Feb 2019 11:04:45 +0100 Subject: [PATCH 004/146] Use the new GC intrinsic lowering Note: these changes depend on the 'configurable-lowering-2' branch of my fork of the julia repo (jonathanvdc/julia). The lowering scheme won't work unless that version of Julia is used. --- src/compiler/optim.jl | 60 +++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 647e83c4..55b16ca6 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -27,7 +27,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) ModulePassManager() do pm initialize!(pm) - add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) + add!(pm, FunctionPass("LowerGCFrame", eager_lower_gc_frame!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) run!(pm, mod) @@ -45,15 +45,13 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) initialize!(pm) ccall(:jl_add_optimization_passes, Cvoid, (LLVM.API.LLVMPassManagerRef, Cint, Cint), - LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0) + LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 1) run!(pm, mod) end ModulePassManager() do pm initialize!(pm) - - # lower intrinsics - add!(pm, 
FunctionPass("LowerGCFrame", lower_gc_frame!)) + add!(pm, ModulePass("FinalLowerGCGPU", lower_final_gc_intrinsics!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) @@ -298,6 +296,30 @@ function fixup_metadata!(f::LLVM.Function) end end +# Visits all calls to a particular intrinsic in a given LLVM module. +function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Module) + if haskey(functions(mod), name) + func = functions(mod)[name] + + for use in uses(func) + call = user(use)::LLVM.CallInst + visit_call(call, func) + end + end +end + +# Deletes all calls to a particular intrinsic in a given LLVM module. +# Returns a Boolean that tells if any calls were actually deleted. +function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(name, mod) do call, _ + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + return changed +end + + # lower object allocations to to PTX malloc # # this is a PoC implementation that is very simple: allocate, and never free. it also runs @@ -306,7 +328,7 @@ end # is currently very architecture/CPU specific: hard-coded pool sizes, TLS references, etc. # such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit # lower-level intrinsics which then can be lowered to architecture-specific code. -function lower_gc_frame!(fun::LLVM.Function) +function eager_lower_gc_frame!(fun::LLVM.Function) job = current_job::CompilerJob mod = LLVM.parent(fun) changed = false @@ -357,34 +379,10 @@ function lower_gc_frame!(fun::LLVM.Function) return changed end -# Visits all calls to a particular intrinsic in a given LLVM module. 
-function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Module) - if haskey(functions(mod), name) - func = functions(mod)[name] - - for use in uses(func) - call = user(use)::LLVM.CallInst - visit_call(call, func) - end - end -end - -# Deletes all calls to a particular intrinsic in a given LLVM module. -# Returns a Boolean that tells if any calls were actually deleted. -function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool - changed = false - visit_calls_to(name, mod) do call, _ - unsafe_delete!(LLVM.parent(call), call) - changed = true - end - return changed -end - -# Lowers the GC intrinsics produce by the LateLowerGCFrame pass. These +# Lowers the GC intrinsics produced by the LateLowerGCFrame pass. These # intrinsics are the last point at which we can intervene in the pipeline # before the passes that deal with them become CPU-specific. function lower_final_gc_intrinsics!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext changed = false # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates From f177a271a3e79e0823e6c4257ee3f3e1b27f0d6e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 28 Feb 2019 15:33:15 +0100 Subject: [PATCH 005/146] Add a simple unified memory example --- examples/shared-memory.jl | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 examples/shared-memory.jl diff --git a/examples/shared-memory.jl b/examples/shared-memory.jl new file mode 100644 index 00000000..e0fede72 --- /dev/null +++ b/examples/shared-memory.jl @@ -0,0 +1,31 @@ +using CUDAdrv, CUDAnative, CuArrays + +using Test + +# Allocates an array of host memory that is page-locked and accessible +# to the device. Maps the allocation into the CUDA address space. +# Returns a (host array, CuArray) pair. The former can be used by +# the host to access the array, the latter can be used by the device. 
+function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} + # Allocate memory that is accessible to both the host and the device. + device_buffer = Mem.alloc(prod(dims) * sizeof(T), true) + + # Wrap the memory in an array for the host. + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(device_buffer.ptr), dims; own = false) + + # Initialize the array's contents. + fill!(host_array, init) + + return host_array, CuArray{T, N}(device_buffer, dims; own = false) +end + +# Allocate a shared array. +dims = (2,4) +host_array, device_array = alloc_shared_array(dims, Int32(42)) + +# Write some values to the array. +host_array[1, 2] = 10 +host_array[2, 1] = 0 + +# Check that the host's version of the array is the same as the device's. +@test host_array == Array(device_array) From 5fd8a0a83875434321e2b1c73877c406ceecd79d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 28 Feb 2019 16:14:34 +0100 Subject: [PATCH 006/146] Add a host-to-device communication example --- examples/host-comm.jl | 77 +++++++++++++++++++++++++++++++++++++++ examples/shared-memory.jl | 11 +++++- 2 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 examples/host-comm.jl diff --git a/examples/host-comm.jl b/examples/host-comm.jl new file mode 100644 index 00000000..2467b680 --- /dev/null +++ b/examples/host-comm.jl @@ -0,0 +1,77 @@ +using CUDAdrv, CUDAnative, CuArrays +import CUDAdrv: @apicall +using Test + +# Allocates an array of host memory that is page-locked and accessible +# to the device. Maps the allocation into the CUDA address space. +# Returns a (host array, CuArray) pair. The former can be used by +# the host to access the array, the latter can be used by the device. +function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} + # Allocate memory that is accessible to both the host and the device. 
+ bytesize = prod(dims) * sizeof(T) + ptr_ref = Ref{Ptr{Cvoid}}() + @apicall( + :cuMemAllocHost, + (Ptr{Ptr{Cvoid}}, Csize_t), + ptr_ref, bytesize) + device_buffer = CUDAdrv.Mem.Buffer(ptr_ref[], bytesize, CuCurrentContext()) + + # Wrap the memory in an array for the host. + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) + + # Initialize the array's contents. + fill!(host_array, init) + + return host_array, CuArray{T, N}(device_buffer, dims; own = false) +end + +# This example shows that devices can communicate with the host +# and vice-versa *during* the execution of a kernel. +# +# What happens is, in chronological order: +# +# 1. A buffer is zero-initialized by the host. +# 2. A kernel is started on the device; said kernel +# waits for the buffer to become nonzero. +# 3. The host makes the buffer nonzero. +# 4. The kernel exists once the buffer is nonzero. +# + +function spin(a) + i = threadIdx().x + blockDim().x * (blockIdx().x-1) + # Make sure that 'a[i]' is actually zero when we get started. + if a[i] != 0.f0 + return + end + + # We wait for the host to set 'a[i]' to a nonzero value. + while true + ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) + if a[i] != 0.f0 + break + end + end + # Next, we set 'a[i]' to some magic value. + a[i] = 42.f0 + return +end + +# Allocate a shared array. +dims = (3,4) +host_array, device_array = alloc_shared_array(dims, 0.f0) + +# Launch the kernel. +@cuda threads=prod(dims) spin(device_array) + +# Go to sleep for a few milliseconds, to make sure +# that the kernel will have started already. +sleep(0.2) + +# Fill the array with ones now to unblock the kernel. +fill!(host_array, 1.f0) + +# Wait for the kernel to exit. +synchronize() + +# Check that the array has been set to the magic value. 
+@test host_array == fill(42.f0, dims) diff --git a/examples/shared-memory.jl b/examples/shared-memory.jl index e0fede72..9b946f73 100644 --- a/examples/shared-memory.jl +++ b/examples/shared-memory.jl @@ -1,4 +1,5 @@ using CUDAdrv, CUDAnative, CuArrays +import CUDAdrv: @apicall using Test @@ -8,10 +9,16 @@ using Test # the host to access the array, the latter can be used by the device. function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} # Allocate memory that is accessible to both the host and the device. - device_buffer = Mem.alloc(prod(dims) * sizeof(T), true) + bytesize = prod(dims) * sizeof(T) + ptr_ref = Ref{Ptr{Cvoid}}() + @apicall( + :cuMemAllocHost, + (Ptr{Ptr{Cvoid}}, Csize_t), + ptr_ref, bytesize) + device_buffer = CUDAdrv.Mem.Buffer(ptr_ref[], bytesize, CuCurrentContext()) # Wrap the memory in an array for the host. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(device_buffer.ptr), dims; own = false) + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) # Initialize the array's contents. fill!(host_array, init) From 1c250c74e23603a568f60c4303daa736587fdad0 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 28 Feb 2019 17:32:50 +0100 Subject: [PATCH 007/146] Fix an outdated comment --- examples/host-comm.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/host-comm.jl b/examples/host-comm.jl index 2467b680..0f33e550 100644 --- a/examples/host-comm.jl +++ b/examples/host-comm.jl @@ -34,7 +34,8 @@ end # 2. A kernel is started on the device; said kernel # waits for the buffer to become nonzero. # 3. The host makes the buffer nonzero. -# 4. The kernel exists once the buffer is nonzero. +# 4. The kernel sets the buffer to a magic value and exits +# once the buffer is nonzero. 
# function spin(a) From f8e6c4b8c266de29e40a8c2fc2e56a7110d54efc Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 1 Mar 2019 18:03:08 +0100 Subject: [PATCH 008/146] Add a kwarg to '@cuda' that serves as a hook for kernel setup The 'init' kwarg to '@cuda' allows users to define custom kernel initialization logic, which is run just prior to the kernel. The main use case for this kwarg right now is setting up globals. --- src/execution.jl | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index aea26da4..10f9faa9 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -8,7 +8,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and # the code it generates, or the execution function split_kwargs(kwargs) - macro_kws = [:dynamic] + macro_kws = [:dynamic, :init] compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] call_kws = [:cooperative, :blocks, :threads, :shmem, :stream] macro_kwargs = [] @@ -137,13 +137,14 @@ macro cuda(ex...) # handle keyword arguments that influence the macro's behavior dynamic = false + env_kwargs = [] for kwarg in macro_kwargs key,val = kwarg.args if key == :dynamic isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value")) dynamic = val::Bool else - throw(ArgumentError("Unsupported keyword argument '$key'")) + push!(env_kwargs, kwarg) end end @@ -159,6 +160,7 @@ macro cuda(ex...) # we're in kernel land already, so no need to cudaconvert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) + prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel($(var_exprs...); $(map(esc, call_kwargs)...)) end) else @@ -173,6 +175,7 @@ macro cuda(ex...) 
local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel(kernel_args...; $(map(esc, call_kwargs)...)) end end) @@ -436,9 +439,25 @@ end return ex end +""" + prepare_kernel(kernel::Kernel{F,TT}; init::Function=nop_init_kernel) + +Prepares a kernel for execution by setting up an environment for that kernel. +This function should be invoked just prior to running the kernel. Its +functionality is included in [`@cuda`](@ref). + +The 'init' keyword argument is a function that takes a kernel as argument and +sets up an environment for the kernel. +""" +function prepare_kernel(kernel::Kernel{F,TT}; init::Function=nop_init_kernel) where {F,TT} + # Just call the 'init' function for now. + init(kernel) +end ## device-side API +# There doesn't seem to be a way to access the documentation for the call-syntax, +# so attach it to the type """ dynamic_cufunction(f, tt=Tuple{}) @@ -493,3 +512,8 @@ function nearest_warpsize(dev::CuDevice, threads::Integer) ws = CUDAdrv.warpsize(dev) return threads + (ws - threads % ws) % ws end + +function nop_init_kernel(kernel::Kernel{F,TT}) where {F,TT} + # Do nothing. + return +end \ No newline at end of file From 5426bece9e42aae80f0a3626189d2bb317a5ae5d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 1 Mar 2019 18:13:00 +0100 Subject: [PATCH 009/146] Add an example that initializes a kernel global --- examples/global-data.jl | 77 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 examples/global-data.jl diff --git a/examples/global-data.jl b/examples/global-data.jl new file mode 100644 index 00000000..2939612a --- /dev/null +++ b/examples/global-data.jl @@ -0,0 +1,77 @@ +using CUDAdrv, CUDAnative, LLVM, LLVM.Interop +using Test + +# This example shows that CUDAnative kernels can include global +# data, which may be set by the host. 
+ +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function get_global_pointer(::Val{global_name}, ::Type{T}) where {global_name, T} + T_global = convert(LLVMType, T) + T_result = convert(LLVMType, Ptr{T}) + + # Create a thunk that computes a pointer to the global. + llvm_f, _ = create_function(T_result) + mod = LLVM.parent(llvm_f) + + # Figure out if the global has been defined already. + globalSet = LLVM.globals(mod) + global_name_string = String(global_name) + if haskey(globalSet, global_name_string) + global_var = globalSet[global_name_string] + else + # If the global hasn't been defined already, then we'll define + # it in the global address space, i.e., address space one. + global_var = GlobalVariable(mod, T_global, global_name_string, 1) + LLVM.initializer!(global_var, LLVM.null(T_global)) + end + + # Generate IR that computes the global's address. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the global variable's type to the result type. + result = ptrtoint!(builder, global_var, T_result) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, Ptr{T}) +end + +macro cuda_global_ptr(name, type) + return :(get_global_pointer( + $(Val(Symbol(name))), + $(esc(type)))) +end + +# Define a kernel that copies the global's value into an array. +function kernel(a::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + + ptr = @cuda_global_ptr("test_global", Float32) + Base.unsafe_store!(a, Base.unsafe_load(ptr), i) + return +end + +magic = 42.f0 + +# Define a kernel initialization function that sets the global +# to the magic value. +function kernel_init(kernel) + global_handle = CuGlobal{Float32}(kernel.mod, "test_global") + set(global_handle, magic) +end + +# Allocate a buffer on the GPU. 
+len = 12 +d_a = Mem.alloc(Float32, len) +ptr = Base.unsafe_convert(CuPtr{Float32}, d_a) + +# Run the kernel. +@cuda threads=len init=kernel_init kernel(ptr) + +# Test that the buffer has indeed been filled with the magic value. +@test Mem.download(Float32, d_a, len) == repeat([magic], len) From 537bfca209dbe9353cdc9bd0ce32a5f68a968b8d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 1 Mar 2019 19:18:03 +0100 Subject: [PATCH 010/146] Include an atomic cmpxchg example --- examples/atomic-exchange.jl | 95 +++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 examples/atomic-exchange.jl diff --git a/examples/atomic-exchange.jl b/examples/atomic-exchange.jl new file mode 100644 index 00000000..f200022d --- /dev/null +++ b/examples/atomic-exchange.jl @@ -0,0 +1,95 @@ +using CUDAdrv, CUDAnative, CUDAatomics, LLVM, LLVM.Interop +using Test + +# This example shows that it is possible to use LLVM's atomic compare +# and exchange instructions from CUDAnative kernels. + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::TPtr, cmp::T, new::T) where {TPtr,T} + T_ptr = convert(LLVMType, TPtr) + T_val = convert(LLVMType, T) + + # Create a thunk that performs the compare and exchange. + llvm_f, _ = create_function(T_val, [T_ptr, T_val, T_val]) + mod = LLVM.parent(llvm_f) + + # Generate IR for the thunk. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the pointer to an actual pointer. + ptr_val = parameters(llvm_f)[1] + if !isa(ptr_val, LLVM.PointerType) + ptr_val = inttoptr!( + builder, + ptr_val, + LLVM.PointerType(T_val)) + end + + # Perform an atomic compare and exchange. + # TODO: find a way to express the sequential consistency ordering + # that is less brittle than `UInt32(7)`. 
+ seq_cst = UInt32(7) + cmpxchg_val = atomic_cmpxchg!( + builder, + ptr_val, + parameters(llvm_f)[2], + parameters(llvm_f)[3], + seq_cst, + seq_cst, + false) + + result = extract_value!(builder, cmpxchg_val, 0) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, T, Tuple{TPtr, T, T}, :((ptr, cmp, new))) +end + +# A store that is implemented using an atomic compare and exchange. +# This is overkill as a store implementation, but it shows that +# atomic compare and exchange works. +function wacky_store!(ptr::CUDAnative.DevicePtr{T}, val::T, index::Integer) where T + atomic_compare_exchange!( + ptr + (index - 1) * sizeof(T), + unsafe_load(ptr, index), + val) +end + +# A kernel that swaps the contents of two buffers using atomic compare +# and exchange instructions. +function vswap(a::CUDAnative.DevicePtr{UInt32}, b::CUDAnative.DevicePtr{UInt32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + a_val = unsafe_load(a, i) + b_val = unsafe_load(b, i) + wacky_store!(b, a_val, i) + wacky_store!(a, b_val, i) + return +end + +# Decide on buffer dimensions. +dims = (12,) +len = prod(dims) + +# Fill two buffers with random garbage. +a = UInt32.(round.(rand(Float32, dims) * 100)) +b = UInt32.(round.(rand(Float32, dims) * 100)) + +# Allocate buffers on the GPU. +d_a = Mem.alloc(UInt32, len) +Mem.upload!(d_a, a) +a_ptr = Base.unsafe_convert(CuPtr{UInt32}, d_a) +d_b = Mem.alloc(UInt32, len) +Mem.upload!(d_b, b) +b_ptr = Base.unsafe_convert(CuPtr{UInt32}, d_b) + +# Run the kernel. +@cuda threads=len vswap(a_ptr, b_ptr) + +# Test that the buffers have indeed been swapped. 
+@test Mem.download(UInt32, d_a, len) == b +@test Mem.download(UInt32, d_b, len) == a From 614d04b6b55569f24ff29894a7e2ea0503e0837d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 13:32:06 +0100 Subject: [PATCH 011/146] Create a fully-featured interrupt example --- examples/interrupt.jl | 297 ++++++++++++++++++++++++++++++++++++++++++ src/execution.jl | 1 + 2 files changed, 298 insertions(+) create mode 100644 examples/interrupt.jl diff --git a/examples/interrupt.jl b/examples/interrupt.jl new file mode 100644 index 00000000..1c264900 --- /dev/null +++ b/examples/interrupt.jl @@ -0,0 +1,297 @@ +using CUDAdrv, CUDAnative, LLVM, LLVM.Interop +import CUDAdrv: @apicall +using Test + +# Allocates an array of host memory that is page-locked and accessible +# to the device. Maps the allocation into the CUDA address space. +# Returns a (host array, device buffer) pair. The former can be used by +# the host to access the array, the latter can be used by the device. +function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} + # Allocate memory that is accessible to both the host and the device. + bytesize = prod(dims) * sizeof(T) + ptr_ref = Ref{Ptr{Cvoid}}() + @apicall( + :cuMemAllocHost, + (Ptr{Ptr{Cvoid}}, Csize_t), + ptr_ref, bytesize) + + device_buffer = CUDAdrv.Mem.Buffer(convert(CuPtr{T}, convert(Csize_t, ptr_ref[])), bytesize, CuCurrentContext()) + + # Wrap the memory in an array for the host. + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) + + # Initialize the array's contents. + fill!(host_array, init) + + return host_array, device_buffer +end + +# Queries a stream for its status. +function query_stream(stream::CUDAdrv.CuStream_t = C_NULL)::Cint + return ccall( + (:cuStreamQuery, CUDAdrv.libcuda), + Cint, + (CUDAdrv.CuStream_t,), + stream) +end + +# This example shows that it is possible to use LLVM's atomic compare +# and exchange instructions from CUDAnative kernels. 
+ +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::TPtr, cmp::T, new::T)::T where {TPtr,T} + ptr_type = convert(LLVMType, TPtr) + lt = string(convert(LLVMType, T)) + if isa(ptr_type, LLVM.PointerType) + ir = """ + %result = cmpxchg volatile $lt* %0, $lt %1, $lt %2 seq_cst seq_cst + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + else + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + end + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) +end + +# Loads a value from a pointer. +@generated function volatile_load(ptr::Ptr{T})::T where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = load volatile $lt, $lt* %ptr + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) +end + +# Stores a value at a particular address. +@generated function volatile_store!(ptr::Ptr{T}, value::T) where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + store volatile $lt %1, $lt* %ptr + ret void + """ + :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) +end + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T} + T_global = convert(LLVMType, T) + T_result = convert(LLVMType, Ptr{T}) + + # Create a thunk that computes a pointer to the global. 
+ llvm_f, _ = create_function(T_result) + mod = LLVM.parent(llvm_f) + + # Figure out if the global has been defined already. + globalSet = LLVM.globals(mod) + global_name_string = String(global_name) + if haskey(globalSet, global_name_string) + global_var = globalSet[global_name_string] + else + # If the global hasn't been defined already, then we'll define + # it in the global address space, i.e., address space one. + global_var = GlobalVariable(mod, T_global, global_name_string, 1) + LLVM.initializer!(global_var, LLVM.null(T_global)) + end + + # Generate IR that computes the global's address. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the global variable's type to the result type. + result = ptrtoint!(builder, global_var, T_result) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, Ptr{T}) +end + +macro cuda_global_ptr(name, type) + return :(get_global_pointer( + $(Val(Symbol(name))), + $(esc(type)))) +end + +# Gets a pointer to the interrupt region. +@inline function get_interrupt_pointer()::Ptr{UInt32} + # Compute a pointer to the global in which a pointer to the + # interrupt state is stored. + ptr = @cuda_global_ptr("interrupt_pointer", Ptr{UInt32}) + # state the pointer, netting us a pointer to the interrupt + # region. + return Base.unsafe_load(ptr) +end + +# The interrupt state is a 32-bit unsigned integer that +# can have one of the following values: +# +# * 0: host is ready to process an interrupt, no interrupt +# is currently being processed. +# * 1: device has requested an interrupt, the interrupt +# has not completed processing yet. +# +const ready = UInt32(0) +const processing = UInt32(1) + +# Requests an interrupt and waits until the interrupt +# completes. If an interrupt is already running, then +# nothing happens. Returns `true` if an interrupt was +# successfully started by this function; otherwise, +# `false`. 
+function interrupt_or_wait()::Bool + state_ptr = get_interrupt_pointer() + prev_state = atomic_compare_exchange!(state_ptr, ready, processing) + wait_for_interrupt() + return prev_state == ready +end + +# Waits for the current interrupt to finish, if an +# interrupt is currently running. +function wait_for_interrupt() + state_ptr = get_interrupt_pointer() + while volatile_load(state_ptr) == processing + end +end + +# Repeatedly requests an interrupt until one is requested +# successfully. +function interrupt() + while !interrupt_or_wait() + end +end + +# Waits for the current kernel to terminate and handle +# any interrupts that we encounter along the way. +function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CUDAdrv.CuStream_t = C_NULL) + while true + # Sleep to save processing power. + sleep(0.001) + + # Query the CUDA stream. + status = query_stream(stream) + if status == CUDAdrv.SUCCESS.code + # The kernel has finished running. We're done here. + return + elseif status == CUDAdrv.ERROR_NOT_READY.code + # The kernel is still running. Check if an interrupt + # needs handling. + if volatile_load(state) == processing + # Run the handler. + handler() + # Set the interrupt state to 'ready'. + volatile_store!(state, ready) + end + + # Continue querying the stream. + else + # Whoa. Something both unexpected and unpleasant seems + # to have happened. Better throw an exception here. + throw(CuError(status)) + end + end +end + +""" + @cuda_interruptible [kwargs...] func(args...) + +High-level interface for executing code on a GPU with support for interrups. +The `@cuda_interruptible` macro should prefix a call, with `func` a callable function +or object that should return nothing. It will be compiled to a CUDA function upon first +use, and to a certain extent arguments will be converted and anaged automatically using +`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. 
+ +Several keyword arguments are supported that influence kernel compilation and execution. For +more information, refer to the documentation of respectively [`cufunction`](@ref) and +[`CUDAnative.Kernel`](@ref). +""" +macro cuda_interruptible(handler, ex...) + # destructure the `@cuda_interruptible` expression + if length(ex) > 0 && ex[1].head == :tuple + error("The tuple argument to @cuda has been replaced by keywords: `@cuda_interruptible threads=... fun(args...)`") + end + call = ex[end] + kwargs = ex[1:end-1] + + # destructure the kernel call + if call.head != :call + throw(ArgumentError("second argument to @cuda_interruptible should be a function call")) + end + f = call.args[1] + args = call.args[2:end] + + code = quote end + compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + vars, var_exprs = CUDAnative.assign_args!(code, args) + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local host_array, device_buffer = alloc_shared_array((1,), ready) + + # Define a kernel initialization function that sets the + # interrupt state pointer. + local function interrupt_kernel_init(kernel) + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, CuPtr{UInt32}(device_buffer.ptr)) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + # Standard kernel setup logic. 
+ local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=interrupt_kernel_init, $(map(esc, env_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts($(esc(handler)), pointer(host_array, 1)) + end + end) + return code +end + +# Define a kernel that invokes the host to do some work. +function kernel() + interrupt() + return +end + +thread_count = 64 + +# Run the kernel. +global counter = 0 +function handle_interrupt() + global counter + counter += 1 +end + +@cuda_interruptible handle_interrupt threads=thread_count kernel() + +@test counter == thread_count diff --git a/src/execution.jl b/src/execution.jl index 10f9faa9..ecfbefe5 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -104,6 +104,7 @@ kernel to determine the launch configuration. A host-side kernel launch is done kernel_args = cudaconvert.(args) kernel_tt = Tuple{Core.Typeof.(kernel_args)...} kernel = cufunction(f, kernel_tt; compilation_kwargs) + prepare_kernel(kernel; environment_kwargs) kernel(kernel_args...; launch_kwargs) end From 6ed1acf7d51f1fbe2c825b3f2ead0bbd95582e1b Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 14:01:07 +0100 Subject: [PATCH 012/146] Update interrupt example to include memory transfer during interrupts --- examples/interrupt.jl | 54 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/examples/interrupt.jl b/examples/interrupt.jl index 1c264900..564f77a7 100644 --- a/examples/interrupt.jl +++ b/examples/interrupt.jl @@ -27,7 +27,7 @@ function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} end # Queries a stream for its status. 
-function query_stream(stream::CUDAdrv.CuStream_t = C_NULL)::Cint +function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint return ccall( (:cuStreamQuery, CUDAdrv.libcuda), Cint, @@ -178,7 +178,7 @@ end # Waits for the current kernel to terminate and handle # any interrupts that we encounter along the way. -function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CUDAdrv.CuStream_t = C_NULL) +function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CuStream = CuDefaultStream()) while true # Sleep to save processing power. sleep(0.001) @@ -240,6 +240,15 @@ macro cuda_interruptible(handler, ex...) compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) vars, var_exprs = CUDAnative.assign_args!(code, args) + # Find the stream on which the kernel is to be scheduled. + stream = CuDefaultStream() + for kwarg in call_kwargs + key, val = kwarg.args + if key == :stream + stream = val + end + end + # convert the arguments, call the compiler and launch the kernel # while keeping the original arguments alive push!(code.args, @@ -271,27 +280,50 @@ macro cuda_interruptible(handler, ex...) kernel(kernel_args...; $(map(esc, call_kwargs)...)) # Handle interrupts. - handle_interrupts($(esc(handler)), pointer(host_array, 1)) + handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) end end) return code end -# Define a kernel that invokes the host to do some work. -function kernel() +# Define a kernel that copies some data from one array to another. +# The host is invoked to populate the source array. +function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x interrupt() + threadfence_system() + Base.unsafe_store!(b, Base.unsafe_load(a, i), i) return end thread_count = 64 -# Run the kernel. -global counter = 0 +# Allocate two arrays. 
+source_array = Mem.alloc(Float32, thread_count) +destination_array = Mem.alloc(Float32, thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Zero-fill the source and destination arrays. +Mem.upload!(source_array, zeros(Float32, thread_count)) +Mem.upload!(destination_array, zeros(Float32, thread_count)) + +# Define one stream for kernel execution and another for +# data transfer. +data_stream = CuStream() +exec_stream = CuStream() + +# Define a magic value. +magic = 42.f0 + +# Configure the interrupt to fill the input array with the magic value. function handle_interrupt() - global counter - counter += 1 + Mem.upload!(source_array, fill(magic, thread_count), data_stream; async = true) + synchronize(data_stream) end -@cuda_interruptible handle_interrupt threads=thread_count kernel() +# Run the kernel. +@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer) -@test counter == thread_count +# Check that the destination buffer is as expected. 
+@test Mem.download(Float32, destination_array, thread_count) == fill(magic, thread_count) From d960a9164b6363dbad631c52ebd03cdd9bdf6d54 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 14:50:53 +0100 Subject: [PATCH 013/146] Define a high-level interrupt interface --- src/CUDAnative.jl | 1 + src/interrupts.jl | 296 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 297 insertions(+) create mode 100644 src/interrupts.jl diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index fc0bbb60..30bfa9c1 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -34,6 +34,7 @@ include(joinpath("device", "runtime.jl")) include("compiler.jl") include("execution.jl") +include("interrupts.jl") include("reflection.jl") include("deprecated.jl") diff --git a/src/interrupts.jl b/src/interrupts.jl new file mode 100644 index 00000000..303bb209 --- /dev/null +++ b/src/interrupts.jl @@ -0,0 +1,296 @@ +# This file implements a high-level generic device-to-host interrupt +# mechanism. This file also contains non-trivial support infrastructure +# that should either be moved to CUDAdrv or exposed by CUDAnative. +# Note that this support infrastructure is not exported, so it remains +# an implementation detail as opposed to a part of CUDAnative's public +# API. + +import CUDAdrv: @apicall + +export @cuda_interruptible, interrupt, interrupt_or_wait, wait_for_interrupt + +# Allocates an array of host memory that is page-locked and accessible +# to the device. Maps the allocation into the CUDA address space. +# Returns a (host array, device buffer) pair. The former can be used by +# the host to access the array, the latter can be used by the device. +function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} + # Allocate memory that is accessible to both the host and the device. 
+ bytesize = prod(dims) * sizeof(T) + ptr_ref = Ref{Ptr{Cvoid}}() + @apicall( + :cuMemAllocHost, + (Ptr{Ptr{Cvoid}}, Csize_t), + ptr_ref, bytesize) + + device_buffer = CUDAdrv.Mem.Buffer(convert(CuPtr{T}, convert(Csize_t, ptr_ref[])), bytesize, CuCurrentContext()) + + # Wrap the memory in an array for the host. + host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) + + # Initialize the array's contents. + fill!(host_array, init) + + return host_array, device_buffer +end + +# Frees an array of host memory. +function free_shared_array(buffer::Mem.Buffer) + ptr = convert(Ptr{Cvoid}, convert(Csize_t, buffer.ptr)) + @apicall( + :cuMemFreeHost, + (Ptr{Cvoid},), + ptr) +end + +# Queries a stream for its status. +function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint + return ccall( + (:cuStreamQuery, CUDAdrv.libcuda), + Cint, + (CUDAdrv.CuStream_t,), + stream) +end + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::Ptr{T}, cmp::T, new::T)::T where T + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) +end + +# Loads a value from a pointer. +@generated function volatile_load(ptr::Ptr{T})::T where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = load volatile $lt, $lt* %ptr + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) +end + +# Stores a value at a particular address. 
+@generated function volatile_store!(ptr::Ptr{T}, value::T) where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + store volatile $lt %1, $lt* %ptr + ret void + """ + :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) +end + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T} + T_global = convert(LLVMType, T) + T_result = convert(LLVMType, Ptr{T}) + + # Create a thunk that computes a pointer to the global. + llvm_f, _ = create_function(T_result) + mod = LLVM.parent(llvm_f) + + # Figure out if the global has been defined already. + globalSet = LLVM.globals(mod) + global_name_string = String(global_name) + if haskey(globalSet, global_name_string) + global_var = globalSet[global_name_string] + else + # If the global hasn't been defined already, then we'll define + # it in the global address space, i.e., address space one. + global_var = GlobalVariable(mod, T_global, global_name_string, 1) + LLVM.initializer!(global_var, LLVM.null(T_global)) + end + + # Generate IR that computes the global's address. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the global variable's type to the result type. + result = ptrtoint!(builder, global_var, T_result) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, Ptr{T}) +end + +macro cuda_global_ptr(name, type) + return :(get_global_pointer( + $(Val(Symbol(name))), + $(esc(type)))) +end + +# Gets a pointer to the interrupt region. +@inline function get_interrupt_pointer()::Ptr{UInt32} + # Compute a pointer to the global in which a pointer to the + # interrupt state is stored. 
+ ptr = @cuda_global_ptr("interrupt_pointer", Ptr{UInt32}) + # state the pointer, netting us a pointer to the interrupt + # region. + return Base.unsafe_load(ptr) +end + +# The interrupt state is a 32-bit unsigned integer that +# can have one of the following values: +# +# * 0: host is ready to process an interrupt, no interrupt +# is currently being processed. +# * 1: device has requested an interrupt, the interrupt +# has not completed processing yet. +# +const ready = UInt32(0) +const processing = UInt32(1) + +# Requests an interrupt and waits until the interrupt +# completes. If an interrupt is already running, then +# nothing happens. Returns `true` if an interrupt was +# successfully started by this function; otherwise, +# `false`. +function interrupt_or_wait()::Bool + state_ptr = get_interrupt_pointer() + prev_state = atomic_compare_exchange!(state_ptr, ready, processing) + wait_for_interrupt() + return prev_state == ready +end + +# Waits for the current interrupt to finish, if an +# interrupt is currently running. +function wait_for_interrupt() + state_ptr = get_interrupt_pointer() + while volatile_load(state_ptr) == processing + end +end + +# Repeatedly requests an interrupt until one is requested +# successfully. +function interrupt() + while !interrupt_or_wait() + end +end + +# Waits for the current kernel to terminate and handle +# any interrupts that we encounter along the way. +function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CuStream = CuDefaultStream()) + while true + # Sleep to save processing power. + sleep(0.001) + + # Query the CUDA stream. + status = query_stream(stream) + if status == CUDAdrv.SUCCESS.code + # The kernel has finished running. We're done here. + return + elseif status == CUDAdrv.ERROR_NOT_READY.code + # The kernel is still running. Check if an interrupt + # needs handling. + if volatile_load(state) == processing + # Run the handler. + handler() + # Set the interrupt state to 'ready'. 
+ volatile_store!(state, ready) + end + + # Continue querying the stream. + else + # Whoa. Something both unexpected and unpleasant seems + # to have happened. Better throw an exception here. + throw(CuError(status)) + end + end +end + +""" + @cuda_interruptible [kwargs...] func(args...) + +High-level interface for executing code on a GPU with support for interrups. +The `@cuda_interruptible` macro should prefix a call, with `func` a callable function +or object that should return nothing. It will be compiled to a CUDA function upon first +use, and to a certain extent arguments will be converted and anaged automatically using +`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. + +Several keyword arguments are supported that influence kernel compilation and execution. For +more information, refer to the documentation of respectively [`cufunction`](@ref) and +[`CUDAnative.Kernel`](@ref). +""" +macro cuda_interruptible(handler, ex...) + # destructure the `@cuda_interruptible` expression + if length(ex) > 0 && ex[1].head == :tuple + error("The tuple argument to @cuda has been replaced by keywords: `@cuda_interruptible handler threads=... fun(args...)`") + end + call = ex[end] + kwargs = ex[1:end-1] + + # destructure the kernel call + if call.head != :call + throw(ArgumentError("second argument to @cuda_interruptible should be a function call")) + end + f = call.args[1] + args = call.args[2:end] + + code = quote end + compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + vars, var_exprs = CUDAnative.assign_args!(code, args) + + # Find the stream on which the kernel is to be scheduled. + stream = CuDefaultStream() + for kwarg in call_kwargs + key, val = kwarg.args + if key == :stream + stream = val + end + end + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) 
begin + # Define a trivial buffer that contains the interrupt state. + local host_array, device_buffer = alloc_shared_array((1,), ready) + + try + # Define a kernel initialization function that sets the + # interrupt state pointer. + local function interrupt_kernel_init(kernel) + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, CuPtr{UInt32}(device_buffer.ptr)) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=interrupt_kernel_init, $(map(esc, env_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) + finally + free_shared_array(device_buffer) + end + end + end) + return code +end From 7c627c09aa5787f2049ecf64ad6c6a8484b994fc Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 14:51:01 +0100 Subject: [PATCH 014/146] Refactor interrupt examples --- examples/interrupt-memory.jl | 44 +++++ examples/interrupt.jl | 324 +---------------------------------- 2 files changed, 53 insertions(+), 315 deletions(-) create mode 100644 examples/interrupt-memory.jl diff --git a/examples/interrupt-memory.jl b/examples/interrupt-memory.jl new file mode 100644 index 00000000..ac68e622 --- /dev/null +++ b/examples/interrupt-memory.jl @@ -0,0 +1,44 @@ +using CUDAdrv, CUDAnative +using Test + +# Define a kernel that copies some data from one array to another. +# The host is invoked to populate the source array. 
+function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + interrupt_or_wait() + threadfence_system() + Base.unsafe_store!(b, Base.unsafe_load(a, i), i) + return +end + +thread_count = 64 + +# Allocate two arrays. +source_array = Mem.alloc(Float32, thread_count) +destination_array = Mem.alloc(Float32, thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Zero-fill the source and destination arrays. +Mem.upload!(source_array, zeros(Float32, thread_count)) +Mem.upload!(destination_array, zeros(Float32, thread_count)) + +# Define one stream for kernel execution and another for +# data transfer. +data_stream = CuStream() +exec_stream = CuStream() + +# Define a magic value. +magic = 42.f0 + +# Configure the interrupt to fill the input array with the magic value. +function handle_interrupt() + Mem.upload!(source_array, fill(magic, thread_count), data_stream; async = true) + synchronize(data_stream) +end + +# Run the kernel. +@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer) + +# Check that the destination buffer is as expected. +@test Mem.download(Float32, destination_array, thread_count) == fill(magic, thread_count) diff --git a/examples/interrupt.jl b/examples/interrupt.jl index 564f77a7..fd5bf155 100644 --- a/examples/interrupt.jl +++ b/examples/interrupt.jl @@ -1,329 +1,23 @@ -using CUDAdrv, CUDAnative, LLVM, LLVM.Interop -import CUDAdrv: @apicall +using CUDAdrv, CUDAnative using Test -# Allocates an array of host memory that is page-locked and accessible -# to the device. Maps the allocation into the CUDA address space. -# Returns a (host array, device buffer) pair. The former can be used by -# the host to access the array, the latter can be used by the device. 
-function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} - # Allocate memory that is accessible to both the host and the device. - bytesize = prod(dims) * sizeof(T) - ptr_ref = Ref{Ptr{Cvoid}}() - @apicall( - :cuMemAllocHost, - (Ptr{Ptr{Cvoid}}, Csize_t), - ptr_ref, bytesize) - - device_buffer = CUDAdrv.Mem.Buffer(convert(CuPtr{T}, convert(Csize_t, ptr_ref[])), bytesize, CuCurrentContext()) - - # Wrap the memory in an array for the host. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) - - # Initialize the array's contents. - fill!(host_array, init) - - return host_array, device_buffer -end - -# Queries a stream for its status. -function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint - return ccall( - (:cuStreamQuery, CUDAdrv.libcuda), - Cint, - (CUDAdrv.CuStream_t,), - stream) -end - -# This example shows that it is possible to use LLVM's atomic compare -# and exchange instructions from CUDAnative kernels. - -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function atomic_compare_exchange!(ptr::TPtr, cmp::T, new::T)::T where {TPtr,T} - ptr_type = convert(LLVMType, TPtr) - lt = string(convert(LLVMType, T)) - if isa(ptr_type, LLVM.PointerType) - ir = """ - %result = cmpxchg volatile $lt* %0, $lt %1, $lt %2 seq_cst seq_cst - %rv = extractvalue { $lt, i1 } %result, 0 - ret $lt %rv - """ - else - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst - %rv = extractvalue { $lt, i1 } %result, 0 - ret $lt %rv - """ - end - :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) -end - -# Loads a value from a pointer. 
-@generated function volatile_load(ptr::Ptr{T})::T where T - ptr_type = string(convert(LLVMType, Ptr{T})) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - %rv = load volatile $lt, $lt* %ptr - ret $lt %rv - """ - :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) -end - -# Stores a value at a particular address. -@generated function volatile_store!(ptr::Ptr{T}, value::T) where T - ptr_type = string(convert(LLVMType, Ptr{T})) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - store volatile $lt %1, $lt* %ptr - ret void - """ - :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) -end - -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T} - T_global = convert(LLVMType, T) - T_result = convert(LLVMType, Ptr{T}) - - # Create a thunk that computes a pointer to the global. - llvm_f, _ = create_function(T_result) - mod = LLVM.parent(llvm_f) - - # Figure out if the global has been defined already. - globalSet = LLVM.globals(mod) - global_name_string = String(global_name) - if haskey(globalSet, global_name_string) - global_var = globalSet[global_name_string] - else - # If the global hasn't been defined already, then we'll define - # it in the global address space, i.e., address space one. - global_var = GlobalVariable(mod, T_global, global_name_string, 1) - LLVM.initializer!(global_var, LLVM.null(T_global)) - end - - # Generate IR that computes the global's address. - Builder(JuliaContext()) do builder - entry = BasicBlock(llvm_f, "entry", JuliaContext()) - position!(builder, entry) - - # Cast the global variable's type to the result type. - result = ptrtoint!(builder, global_var, T_result) - ret!(builder, result) - end - - # Call the function. 
- call_function(llvm_f, Ptr{T}) -end - -macro cuda_global_ptr(name, type) - return :(get_global_pointer( - $(Val(Symbol(name))), - $(esc(type)))) -end - -# Gets a pointer to the interrupt region. -@inline function get_interrupt_pointer()::Ptr{UInt32} - # Compute a pointer to the global in which a pointer to the - # interrupt state is stored. - ptr = @cuda_global_ptr("interrupt_pointer", Ptr{UInt32}) - # state the pointer, netting us a pointer to the interrupt - # region. - return Base.unsafe_load(ptr) -end - -# The interrupt state is a 32-bit unsigned integer that -# can have one of the following values: -# -# * 0: host is ready to process an interrupt, no interrupt -# is currently being processed. -# * 1: device has requested an interrupt, the interrupt -# has not completed processing yet. -# -const ready = UInt32(0) -const processing = UInt32(1) - -# Requests an interrupt and waits until the interrupt -# completes. If an interrupt is already running, then -# nothing happens. Returns `true` if an interrupt was -# successfully started by this function; otherwise, -# `false`. -function interrupt_or_wait()::Bool - state_ptr = get_interrupt_pointer() - prev_state = atomic_compare_exchange!(state_ptr, ready, processing) - wait_for_interrupt() - return prev_state == ready -end - -# Waits for the current interrupt to finish, if an -# interrupt is currently running. -function wait_for_interrupt() - state_ptr = get_interrupt_pointer() - while volatile_load(state_ptr) == processing - end -end - -# Repeatedly requests an interrupt until one is requested -# successfully. -function interrupt() - while !interrupt_or_wait() - end -end - -# Waits for the current kernel to terminate and handle -# any interrupts that we encounter along the way. -function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CuStream = CuDefaultStream()) - while true - # Sleep to save processing power. - sleep(0.001) - - # Query the CUDA stream. 
- status = query_stream(stream) - if status == CUDAdrv.SUCCESS.code - # The kernel has finished running. We're done here. - return - elseif status == CUDAdrv.ERROR_NOT_READY.code - # The kernel is still running. Check if an interrupt - # needs handling. - if volatile_load(state) == processing - # Run the handler. - handler() - # Set the interrupt state to 'ready'. - volatile_store!(state, ready) - end - - # Continue querying the stream. - else - # Whoa. Something both unexpected and unpleasant seems - # to have happened. Better throw an exception here. - throw(CuError(status)) - end - end -end - -""" - @cuda_interruptible [kwargs...] func(args...) - -High-level interface for executing code on a GPU with support for interrups. -The `@cuda_interruptible` macro should prefix a call, with `func` a callable function -or object that should return nothing. It will be compiled to a CUDA function upon first -use, and to a certain extent arguments will be converted and anaged automatically using -`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel -launch on the current CUDA context. - -Several keyword arguments are supported that influence kernel compilation and execution. For -more information, refer to the documentation of respectively [`cufunction`](@ref) and -[`CUDAnative.Kernel`](@ref). -""" -macro cuda_interruptible(handler, ex...) - # destructure the `@cuda_interruptible` expression - if length(ex) > 0 && ex[1].head == :tuple - error("The tuple argument to @cuda has been replaced by keywords: `@cuda_interruptible threads=... 
fun(args...)`") - end - call = ex[end] - kwargs = ex[1:end-1] - - # destructure the kernel call - if call.head != :call - throw(ArgumentError("second argument to @cuda_interruptible should be a function call")) - end - f = call.args[1] - args = call.args[2:end] - - code = quote end - compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) - vars, var_exprs = CUDAnative.assign_args!(code, args) - - # Find the stream on which the kernel is to be scheduled. - stream = CuDefaultStream() - for kwarg in call_kwargs - key, val = kwarg.args - if key == :stream - stream = val - end - end - - # convert the arguments, call the compiler and launch the kernel - # while keeping the original arguments alive - push!(code.args, - quote - GC.@preserve $(vars...) begin - # Define a trivial buffer that contains the interrupt state. - local host_array, device_buffer = alloc_shared_array((1,), ready) - - # Define a kernel initialization function that sets the - # interrupt state pointer. - local function interrupt_kernel_init(kernel) - try - global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") - set(global_handle, CuPtr{UInt32}(device_buffer.ptr)) - catch exception - # The interrupt pointer may not have been declared (because it is unused). - # In that case, we should do nothing. - if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code - rethrow() - end - end - end - - # Standard kernel setup logic. - local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) - local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) - CUDAnative.prepare_kernel(kernel; init=interrupt_kernel_init, $(map(esc, env_kwargs)...)) - kernel(kernel_args...; $(map(esc, call_kwargs)...)) - - # Handle interrupts. 
- handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) - end - end) - return code -end - -# Define a kernel that copies some data from one array to another. -# The host is invoked to populate the source array. -function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x +# Define a kernel that makes the host count. +function kernel() interrupt() - threadfence_system() - Base.unsafe_store!(b, Base.unsafe_load(a, i), i) return end thread_count = 64 -# Allocate two arrays. -source_array = Mem.alloc(Float32, thread_count) -destination_array = Mem.alloc(Float32, thread_count) -source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) -destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) - -# Zero-fill the source and destination arrays. -Mem.upload!(source_array, zeros(Float32, thread_count)) -Mem.upload!(destination_array, zeros(Float32, thread_count)) - -# Define one stream for kernel execution and another for -# data transfer. -data_stream = CuStream() -exec_stream = CuStream() - -# Define a magic value. -magic = 42.f0 - -# Configure the interrupt to fill the input array with the magic value. +# Configure the interrupt to increment a counter. +global counter = 0 function handle_interrupt() - Mem.upload!(source_array, fill(magic, thread_count), data_stream; async = true) - synchronize(data_stream) + global counter + counter += 1 end # Run the kernel. -@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer) +@cuda_interruptible handle_interrupt threads=thread_count kernel() # Check that the destination buffer is as expected. 
-@test Mem.download(Float32, destination_array, thread_count) == fill(magic, thread_count) +@test counter == thread_count From c45f33df55b40b27370819a2c8caf1ce2749a6ce Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 14:55:51 +0100 Subject: [PATCH 015/146] Document interrupt API --- src/interrupts.jl | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/src/interrupts.jl b/src/interrupts.jl index 303bb209..2eb6e7b1 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -154,11 +154,15 @@ end const ready = UInt32(0) const processing = UInt32(1) -# Requests an interrupt and waits until the interrupt -# completes. If an interrupt is already running, then -# nothing happens. Returns `true` if an interrupt was -# successfully started by this function; otherwise, -# `false`. +""" + interrupt_or_wait() + +Requests an interrupt and waits until the interrupt completes. +If an interrupt is already running, then this function waits +for that interrupt to complete, but does not request an interrupt +of its own. Returns `true` if an interrupt was successfully +requested by this function; otherwise, `false`. +""" function interrupt_or_wait()::Bool state_ptr = get_interrupt_pointer() prev_state = atomic_compare_exchange!(state_ptr, ready, processing) @@ -166,16 +170,23 @@ function interrupt_or_wait()::Bool return prev_state == ready end -# Waits for the current interrupt to finish, if an -# interrupt is currently running. +""" + wait_for_interrupt() + +Waits for the current interrupt to finish, if an interrupt is +currently running. +""" function wait_for_interrupt() state_ptr = get_interrupt_pointer() while volatile_load(state_ptr) == processing end end -# Repeatedly requests an interrupt until one is requested -# successfully. +""" + interrupt() + +Repeatedly requests an interrupt until one is requested successfully. 
+""" function interrupt() while !interrupt_or_wait() end From 7c6906b065ee265de296dac17ab4cc8ddc362fd5 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 15:19:41 +0100 Subject: [PATCH 016/146] Define interrupt tests --- examples/interrupt.jl | 3 ++- test/device/interrupts.jl | 57 +++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 test/device/interrupts.jl diff --git a/examples/interrupt.jl b/examples/interrupt.jl index fd5bf155..a1c8f81e 100644 --- a/examples/interrupt.jl +++ b/examples/interrupt.jl @@ -19,5 +19,6 @@ end # Run the kernel. @cuda_interruptible handle_interrupt threads=thread_count kernel() -# Check that the destination buffer is as expected. +# Check that the counter's final value equals the number +# of threads. @test counter == thread_count diff --git a/test/device/interrupts.jl b/test/device/interrupts.jl new file mode 100644 index 00000000..0e6d8ab4 --- /dev/null +++ b/test/device/interrupts.jl @@ -0,0 +1,57 @@ +@testset "interrupts" begin + +############################################################################################ + +dummy() = return + +dummy_handler(kernel) = return + +@testset "@cuda_interruptible" begin + +@test_throws UndefVarError @cuda_interruptible dummy_handler undefined() +@test_throws MethodError @cuda_interruptible dummy_handler dummy(1) + +@testset "compilation params" begin + @cuda_interruptible dummy_handler dummy() + + @test_throws CuError @cuda_interruptible dummy_handler threads=2 maxthreads=1 dummy() + @cuda_interruptible dummy_handler threads=2 dummy() +end + +@testset "count" begin + + # This test uses interrupts to increment a host counter and then + # checks that the counter's value equals the number of interrupts. + # This is a useful thing to check because it verifies that interrupts + # are neither skipped nor performed twice. 
+ # + # We will use a sizeable number of threads (128) to give us a better + # shot at detecting concurrency errors, if any. The number of skipped + # interrupts is unlikely to equal the number of additional, unwanted + # interrupts for this many threads. + thread_count = 128 + + # Define a kernel that makes the host count. + function increment_counter() + interrupt() + return + end + + # Configure the interrupt to increment a counter. + global counter = 0 + function handle_interrupt() + global counter + counter += 1 + end + + # Run the kernel. + @cuda_interruptible handle_interrupt threads=thread_count increment_counter() + + # Check that the counter's final value equals the number + # of threads. + @test counter == thread_count +end + +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index 0ca46096..f382330d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -65,6 +65,7 @@ if CUDAnative.configured else include("device/codegen.jl") include("device/execution.jl") + include("device/interrupts.jl") include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") From 9c73a28da01015ce9b552b008689de3361a55558 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 15:27:43 +0100 Subject: [PATCH 017/146] Add another interrupt test --- test/device/interrupts.jl | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/test/device/interrupts.jl b/test/device/interrupts.jl index 0e6d8ab4..a07a57e8 100644 --- a/test/device/interrupts.jl +++ b/test/device/interrupts.jl @@ -19,7 +19,6 @@ dummy_handler(kernel) = return end @testset "count" begin - # This test uses interrupts to increment a host counter and then # checks that the counter's value equals the number of interrupts. 
# This is a useful thing to check because it verifies that interrupts @@ -52,6 +51,37 @@ end @test counter == thread_count end +@testset "count in stream" begin + # This test is a copy of the previous test, but it uses a non-default + # CUDA stream. This should Just Work: @cuda_interruptible should + # intercept the `stream=...` argument and pass it to the stream-querying + # logic. All of this should be entirely transparent to the user. + thread_count = 128 + + # Define a kernel that makes the host count. + function increment_counter() + interrupt() + return + end + + # Configure the interrupt to increment a counter. + global counter = 0 + function handle_interrupt() + global counter + counter += 1 + end + + # Define a CUDA stream. + exec_stream = CuStream() + + # Run the kernel. + @cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream increment_counter() + + # Check that the counter's final value equals the number + # of threads. + @test counter == thread_count +end + end end From 47439eb7371cd15610d61c38ab157e5ed8ed8178 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 4 Mar 2019 15:34:56 +0100 Subject: [PATCH 018/146] Remove experimental examples I built these examples mostly as experiments. Their core logic ended up in 'interrupts.jl', which is cleverly designed to expose a high-level interface. The examples deleted by this commit are not: they're low-level and kind of hacky. 
--- examples/atomic-exchange.jl | 95 ------------------------------------- examples/global-data.jl | 77 ------------------------------ examples/host-comm.jl | 78 ------------------------------ examples/shared-memory.jl | 38 --------------- 4 files changed, 288 deletions(-) delete mode 100644 examples/atomic-exchange.jl delete mode 100644 examples/global-data.jl delete mode 100644 examples/host-comm.jl delete mode 100644 examples/shared-memory.jl diff --git a/examples/atomic-exchange.jl b/examples/atomic-exchange.jl deleted file mode 100644 index f200022d..00000000 --- a/examples/atomic-exchange.jl +++ /dev/null @@ -1,95 +0,0 @@ -using CUDAdrv, CUDAnative, CUDAatomics, LLVM, LLVM.Interop -using Test - -# This example shows that it is possible to use LLVM's atomic compare -# and exchange instructions from CUDAnative kernels. - -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function atomic_compare_exchange!(ptr::TPtr, cmp::T, new::T) where {TPtr,T} - T_ptr = convert(LLVMType, TPtr) - T_val = convert(LLVMType, T) - - # Create a thunk that performs the compare and exchange. - llvm_f, _ = create_function(T_val, [T_ptr, T_val, T_val]) - mod = LLVM.parent(llvm_f) - - # Generate IR for the thunk. - Builder(JuliaContext()) do builder - entry = BasicBlock(llvm_f, "entry", JuliaContext()) - position!(builder, entry) - - # Cast the pointer to an actual pointer. - ptr_val = parameters(llvm_f)[1] - if !isa(ptr_val, LLVM.PointerType) - ptr_val = inttoptr!( - builder, - ptr_val, - LLVM.PointerType(T_val)) - end - - # Perform an atomic compare and exchange. - # TODO: find a way to express the sequential consistency ordering - # that is less brittle than `UInt32(7)`. 
- seq_cst = UInt32(7) - cmpxchg_val = atomic_cmpxchg!( - builder, - ptr_val, - parameters(llvm_f)[2], - parameters(llvm_f)[3], - seq_cst, - seq_cst, - false) - - result = extract_value!(builder, cmpxchg_val, 0) - ret!(builder, result) - end - - # Call the function. - call_function(llvm_f, T, Tuple{TPtr, T, T}, :((ptr, cmp, new))) -end - -# A store that is implemented using an atomic compare and exchange. -# This is overkill as a store implementation, but it shows that -# atomic compare and exchange works. -function wacky_store!(ptr::CUDAnative.DevicePtr{T}, val::T, index::Integer) where T - atomic_compare_exchange!( - ptr + (index - 1) * sizeof(T), - unsafe_load(ptr, index), - val) -end - -# A kernel that swaps the contents of two buffers using atomic compare -# and exchange instructions. -function vswap(a::CUDAnative.DevicePtr{UInt32}, b::CUDAnative.DevicePtr{UInt32}) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x - a_val = unsafe_load(a, i) - b_val = unsafe_load(b, i) - wacky_store!(b, a_val, i) - wacky_store!(a, b_val, i) - return -end - -# Decide on buffer dimensions. -dims = (12,) -len = prod(dims) - -# Fill two buffers with random garbage. -a = UInt32.(round.(rand(Float32, dims) * 100)) -b = UInt32.(round.(rand(Float32, dims) * 100)) - -# Allocate buffers on the GPU. -d_a = Mem.alloc(UInt32, len) -Mem.upload!(d_a, a) -a_ptr = Base.unsafe_convert(CuPtr{UInt32}, d_a) -d_b = Mem.alloc(UInt32, len) -Mem.upload!(d_b, b) -b_ptr = Base.unsafe_convert(CuPtr{UInt32}, d_b) - -# Run the kernel. -@cuda threads=len vswap(a_ptr, b_ptr) - -# Test that the buffers have indeed been swapped. 
-@test Mem.download(UInt32, d_a, len) == b -@test Mem.download(UInt32, d_b, len) == a diff --git a/examples/global-data.jl b/examples/global-data.jl deleted file mode 100644 index 2939612a..00000000 --- a/examples/global-data.jl +++ /dev/null @@ -1,77 +0,0 @@ -using CUDAdrv, CUDAnative, LLVM, LLVM.Interop -using Test - -# This example shows that CUDAnative kernels can include global -# data, which may be set by the host. - -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function get_global_pointer(::Val{global_name}, ::Type{T}) where {global_name, T} - T_global = convert(LLVMType, T) - T_result = convert(LLVMType, Ptr{T}) - - # Create a thunk that computes a pointer to the global. - llvm_f, _ = create_function(T_result) - mod = LLVM.parent(llvm_f) - - # Figure out if the global has been defined already. - globalSet = LLVM.globals(mod) - global_name_string = String(global_name) - if haskey(globalSet, global_name_string) - global_var = globalSet[global_name_string] - else - # If the global hasn't been defined already, then we'll define - # it in the global address space, i.e., address space one. - global_var = GlobalVariable(mod, T_global, global_name_string, 1) - LLVM.initializer!(global_var, LLVM.null(T_global)) - end - - # Generate IR that computes the global's address. - Builder(JuliaContext()) do builder - entry = BasicBlock(llvm_f, "entry", JuliaContext()) - position!(builder, entry) - - # Cast the global variable's type to the result type. - result = ptrtoint!(builder, global_var, T_result) - ret!(builder, result) - end - - # Call the function. - call_function(llvm_f, Ptr{T}) -end - -macro cuda_global_ptr(name, type) - return :(get_global_pointer( - $(Val(Symbol(name))), - $(esc(type)))) -end - -# Define a kernel that copies the global's value into an array. 
-function kernel(a::CUDAnative.DevicePtr{Float32}) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x - - ptr = @cuda_global_ptr("test_global", Float32) - Base.unsafe_store!(a, Base.unsafe_load(ptr), i) - return -end - -magic = 42.f0 - -# Define a kernel initialization function that sets the global -# to the magic value. -function kernel_init(kernel) - global_handle = CuGlobal{Float32}(kernel.mod, "test_global") - set(global_handle, magic) -end - -# Allocate a buffer on the GPU. -len = 12 -d_a = Mem.alloc(Float32, len) -ptr = Base.unsafe_convert(CuPtr{Float32}, d_a) - -# Run the kernel. -@cuda threads=len init=kernel_init kernel(ptr) - -# Test that the buffer has indeed been filled with the magic value. -@test Mem.download(Float32, d_a, len) == repeat([magic], len) diff --git a/examples/host-comm.jl b/examples/host-comm.jl deleted file mode 100644 index 0f33e550..00000000 --- a/examples/host-comm.jl +++ /dev/null @@ -1,78 +0,0 @@ -using CUDAdrv, CUDAnative, CuArrays -import CUDAdrv: @apicall -using Test - -# Allocates an array of host memory that is page-locked and accessible -# to the device. Maps the allocation into the CUDA address space. -# Returns a (host array, CuArray) pair. The former can be used by -# the host to access the array, the latter can be used by the device. -function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} - # Allocate memory that is accessible to both the host and the device. - bytesize = prod(dims) * sizeof(T) - ptr_ref = Ref{Ptr{Cvoid}}() - @apicall( - :cuMemAllocHost, - (Ptr{Ptr{Cvoid}}, Csize_t), - ptr_ref, bytesize) - device_buffer = CUDAdrv.Mem.Buffer(ptr_ref[], bytesize, CuCurrentContext()) - - # Wrap the memory in an array for the host. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) - - # Initialize the array's contents. 
- fill!(host_array, init) - - return host_array, CuArray{T, N}(device_buffer, dims; own = false) -end - -# This example shows that devices can communicate with the host -# and vice-versa *during* the execution of a kernel. -# -# What happens is, in chronological order: -# -# 1. A buffer is zero-initialized by the host. -# 2. A kernel is started on the device; said kernel -# waits for the buffer to become nonzero. -# 3. The host makes the buffer nonzero. -# 4. The kernel sets the buffer to a magic value and exits -# once the buffer is nonzero. -# - -function spin(a) - i = threadIdx().x + blockDim().x * (blockIdx().x-1) - # Make sure that 'a[i]' is actually zero when we get started. - if a[i] != 0.f0 - return - end - - # We wait for the host to set 'a[i]' to a nonzero value. - while true - ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) - if a[i] != 0.f0 - break - end - end - # Next, we set 'a[i]' to some magic value. - a[i] = 42.f0 - return -end - -# Allocate a shared array. -dims = (3,4) -host_array, device_array = alloc_shared_array(dims, 0.f0) - -# Launch the kernel. -@cuda threads=prod(dims) spin(device_array) - -# Go to sleep for a few milliseconds, to make sure -# that the kernel will have started already. -sleep(0.2) - -# Fill the array with ones now to unblock the kernel. -fill!(host_array, 1.f0) - -# Wait for the kernel to exit. -synchronize() - -# Check that the array has been set to the magic value. -@test host_array == fill(42.f0, dims) diff --git a/examples/shared-memory.jl b/examples/shared-memory.jl deleted file mode 100644 index 9b946f73..00000000 --- a/examples/shared-memory.jl +++ /dev/null @@ -1,38 +0,0 @@ -using CUDAdrv, CUDAnative, CuArrays -import CUDAdrv: @apicall - -using Test - -# Allocates an array of host memory that is page-locked and accessible -# to the device. Maps the allocation into the CUDA address space. -# Returns a (host array, CuArray) pair. 
The former can be used by -# the host to access the array, the latter can be used by the device. -function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} - # Allocate memory that is accessible to both the host and the device. - bytesize = prod(dims) * sizeof(T) - ptr_ref = Ref{Ptr{Cvoid}}() - @apicall( - :cuMemAllocHost, - (Ptr{Ptr{Cvoid}}, Csize_t), - ptr_ref, bytesize) - device_buffer = CUDAdrv.Mem.Buffer(ptr_ref[], bytesize, CuCurrentContext()) - - # Wrap the memory in an array for the host. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) - - # Initialize the array's contents. - fill!(host_array, init) - - return host_array, CuArray{T, N}(device_buffer, dims; own = false) -end - -# Allocate a shared array. -dims = (2,4) -host_array, device_array = alloc_shared_array(dims, Int32(42)) - -# Write some values to the array. -host_array[1, 2] = 10 -host_array[2, 1] = 0 - -# Check that the host's version of the array is the same as the device's. -@test host_array == Array(device_array) From 297bedc4b206c42bd12f691ec37e1b1ce9d22743 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 5 Mar 2019 14:09:38 +0100 Subject: [PATCH 019/146] Implement a reader-writer lock --- examples/lock.jl | 31 ++++++++ src/CUDAnative.jl | 4 + src/device/threading.jl | 159 +++++++++++++++++++++++++++++++++++++++ src/interrupts.jl | 39 ---------- test/device/threading.jl | 91 ++++++++++++++++++++++ test/runtests.jl | 2 + 6 files changed, 287 insertions(+), 39 deletions(-) create mode 100644 examples/lock.jl create mode 100644 src/device/threading.jl create mode 100644 test/device/threading.jl diff --git a/examples/lock.jl b/examples/lock.jl new file mode 100644 index 00000000..b4269a7b --- /dev/null +++ b/examples/lock.jl @@ -0,0 +1,31 @@ +using CUDAdrv, CUDAnative +using Test + +thread_count = 128 + +# Define a kernel that atomically increments a counter using a lock. 
+function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + lock = ReaderWriterLock(lock_state) + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + return +end + +# Allocate memory for the counter and the lock. +counter_buf = Mem.alloc(sizeof(Int32)) +Mem.upload!(counter_buf, [Int32(0)]) +counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + +lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) +Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) +lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + +# @device_code_warntype increment_counter(counter_pointer, lock_pointer) + +# Run the kernel. +@cuda threads=thread_count increment_counter(counter_pointer, lock_pointer) + +# Check that the counter's final value equals the number +# of threads. +@test Mem.download(Int32, counter_buf) == [Int32(thread_count)] diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 30bfa9c1..6f18eeb1 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -31,6 +31,10 @@ include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) include(joinpath("device", "llvm.jl")) include(joinpath("device", "runtime.jl")) +include(joinpath("device", "libdevice.jl")) +include(joinpath("device", "cuda_intrinsics.jl")) +include(joinpath("device", "runtime_intrinsics.jl")) +include(joinpath("device", "threading.jl")) include("compiler.jl") include("execution.jl") diff --git a/src/device/threading.jl b/src/device/threading.jl new file mode 100644 index 00000000..8bbeadf9 --- /dev/null +++ b/src/device/threading.jl @@ -0,0 +1,159 @@ +# This file implements threading primitives that work for CUDAnative kernels. + +export ReaderWriterLock, reader_locked, writer_locked + +# Gets a pointer to a global with a particular name. 
If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::Ptr{T}, cmp::T, new::T)::T where T + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) +end + +# Atomically adds a value to a variable pointed to by a pointer. +# Returns the previous value stored in that value. +@generated function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = atomicrmw volatile add $lt* %ptr, $lt %1 seq_cst + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T}, lhs, rhs)) +end + +# Loads a value from a pointer. +@generated function volatile_load(ptr::Ptr{T})::T where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = load volatile $lt, $lt* %ptr + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) +end + +# Stores a value at a particular address. +@generated function volatile_store!(ptr::Ptr{T}, value::T) where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + store volatile $lt %1, $lt* %ptr + ret void + """ + :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) +end + +const ReaderWriterLockState = Int64 + +""" +A reader-writer lock: a lock that supports concurrent access for +read operations and exclusive access for write operations. +""" +struct ReaderWriterLock + # A pointer to the reader-writer lock's state. 
The state + # is a counter that can be in one of the following states: + # + # * > 0: the lock is acquired by one or more readers. + # The state counter describes the number of readers + # that have acquired the lock. + # + # * = 0: the lock is idle. + # + # * < 0: the lock is acquired by a single writer. + # + state_ptr::Ptr{ReaderWriterLockState} +end + +ReaderWriterLock(state_ptr::DevicePtr{ReaderWriterLockState}) = ReaderWriterLock( + convert(Ptr{ReaderWriterLockState}, convert(Csize_t, state_ptr))) + +const max_rw_lock_readers = (1 << (sizeof(ReaderWriterLockState) * 8 - 1)) + +# Serializes execution of a function within a warp, to combat thread +# divergence-related deadlocks. +function warp_serialized(func::Function) + # Get the current thread's ID. + thread_id = threadIdx().x - 1 + + # Get the size of a warp. + size = warpsize() + + local result + i = 0 + while i < size + if thread_id % size == i + result = func() + end + i += 1 + end + return result +end + +""" + reader_locked(func::Function, lock::ReaderWriterLock) + +Acquires a reader-writer lock in reader mode, runs `func` while the lock is +acquired and releases the lock again. +""" +function reader_locked(func::Function, lock::ReaderWriterLock) + warp_serialized() do + while true + # Increment the reader count. If the lock is in write-acquired mode, + # then the lock will stay in that mode (unless the reader count is + # exceeded, but that is virtually impossible). Otherwise, the lock + # will end up in read-acquired mode. + previous_state = atomic_add!(lock.state_ptr, 1) + + # If the lock was in the idle or read-acquired state, then + # it is now in read-acquired mode. + if previous_state >= 0 + # Run the function. + result = func() + # Decrement the reader count to release the reader lock. + atomic_add!(lock.state_ptr, -1) + # We're done here. + return result + end + + # Decrement the reader count and try again. 
+ atomic_add!(lock.state_ptr, -1) + end + end +end + +""" + writer_locked(func::Function, lock::ReaderWriterLock) + +Acquires a reader-writer lock in writer mode, runs `func` while the lock is +acquired and releases the lock again. +""" +function writer_locked(func::Function, lock::ReaderWriterLock) + warp_serialized() do + # Try to move the lock from 'idle' to 'write-acquired'. + while atomic_compare_exchange!(lock.state_ptr, 0, -max_rw_lock_readers) != 0 + end + + # We acquired the lock. Run the function. + result = func() + + # Release the lock by atomically adding `max_rw_lock_readers` to the + # lock's state. It's important that we use an atomic add instead of a + # simple store because a store might cause a race condition with `read_locked` + # that'll put us in a deadlock state. + atomic_add!(lock.state_ptr, max_rw_lock_readers) + + # We're done here. + return result + end +end diff --git a/src/interrupts.jl b/src/interrupts.jl index 2eb6e7b1..03068387 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -51,45 +51,6 @@ function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint stream) end -# Gets a pointer to a global with a particular name. If the global -# does not exist yet, then it is declared in the global memory address -# space. -@generated function atomic_compare_exchange!(ptr::Ptr{T}, cmp::T, new::T)::T where T - ptr_type = convert(LLVMType, Ptr{T}) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst - %rv = extractvalue { $lt, i1 } %result, 0 - ret $lt %rv - """ - :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) -end - -# Loads a value from a pointer. 
-@generated function volatile_load(ptr::Ptr{T})::T where T - ptr_type = string(convert(LLVMType, Ptr{T})) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - %rv = load volatile $lt, $lt* %ptr - ret $lt %rv - """ - :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) -end - -# Stores a value at a particular address. -@generated function volatile_store!(ptr::Ptr{T}, value::T) where T - ptr_type = string(convert(LLVMType, Ptr{T})) - lt = string(convert(LLVMType, T)) - ir = """ - %ptr = inttoptr $ptr_type %0 to $lt* - store volatile $lt %1, $lt* %ptr - ret void - """ - :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) -end - # Gets a pointer to a global with a particular name. If the global # does not exist yet, then it is declared in the global memory address # space. diff --git a/test/device/threading.jl b/test/device/threading.jl new file mode 100644 index 00000000..fa9533b1 --- /dev/null +++ b/test/device/threading.jl @@ -0,0 +1,91 @@ +@testset "threading" begin + +############################################################################################ + +@testset "reader-writer lock" begin + +@testset "writers only" begin + + thread_count = 128 + + # Define a kernel that atomically increments a counter using a lock. + function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + lock = ReaderWriterLock(lock_state) + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + return + end + + # Allocate memory for the counter and the lock. 
+ counter_buf = Mem.alloc(sizeof(Int32)) + Mem.upload!(counter_buf, [Int32(0)]) + counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + + lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) + Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) + lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + + # Run the kernel. + @cuda threads=thread_count increment_counter(counter_pointer, lock_pointer) + + # Check that the counter's final value equals the number + # of threads. + @test Mem.download(Int32, counter_buf) == [Int32(thread_count)] + +end + +@testset "readers and writers" begin + + thread_count = 128 + + # Define a kernel. + function mutate_counter_maybe(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + lock = ReaderWriterLock(lock_state) + # Read the previous counter and update the current counter. + # Do this many times. + if i % 16 == 0 + # Some threads get to atomically increment the counter. + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + else + # All the other threads acquire the lock in reader mode + # and check that the counter's value doesn't change. + reader_locked(lock) do + counter_ptr = convert(Ptr{Int32}, convert(Csize_t, counter)) + counter_val = CUDAnative.volatile_load(counter_ptr) + j = 0 + while j < 10 + if CUDAnative.volatile_load(counter_ptr) != counter_val + throw(ErrorException("oh no")) + end + j += 1 + end + end + end + return + end + + # Allocate memory for the counter and the lock. 
+ counter_buf = Mem.alloc(sizeof(Int32)) + Mem.upload!(counter_buf, [Int32(0)]) + counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + + lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) + Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) + lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + + # Run the kernel. + @cuda threads=thread_count mutate_counter_maybe(counter_pointer, lock_pointer) + + # Check that the counter's final value equals the number + # of threads. + @test Mem.download(Int32, counter_buf) == [Int32(thread_count / 16)] + +end + +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index f382330d..05e1687f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -69,6 +69,8 @@ if CUDAnative.configured include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") + include("device/intrinsics.jl") + include("device/threading.jl") #include("examples.jl") end From cfb6dd8e79385f1fa95136b57f748b5a4fbe012f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 5 Mar 2019 18:07:30 +0100 Subject: [PATCH 020/146] Create an allocator prototype for the GC --- examples/gc-malloc.jl | 30 ++++ src/CUDAnative.jl | 1 + src/gc.jl | 325 ++++++++++++++++++++++++++++++++++++++++++ src/interrupts.jl | 2 +- 4 files changed, 357 insertions(+), 1 deletion(-) create mode 100644 examples/gc-malloc.jl create mode 100644 src/gc.jl diff --git a/examples/gc-malloc.jl b/examples/gc-malloc.jl new file mode 100644 index 00000000..597ed2ae --- /dev/null +++ b/examples/gc-malloc.jl @@ -0,0 +1,30 @@ +using CUDAdrv, CUDAnative +using Test + +# Define a kernel that copies values using a temporary buffer. 
+function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + buffer = Base.unsafe_convert(Ptr{Float32}, gc_malloc(sizeof(Float32) * Csize_t(16))) + + unsafe_store!(buffer, unsafe_load(a, i), i % 13) + unsafe_store!(b, unsafe_load(buffer, i % 13), i) + + return +end + +thread_count = 64 + +# Allocate two arrays. +source_array = Mem.alloc(Float32, thread_count) +destination_array = Mem.alloc(Float32, thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Fill the source and destination arrays. +Mem.upload!(source_array, fill(42.f0, thread_count)) +Mem.upload!(destination_array, zeros(Float32, thread_count)) + +# Run the kernel. +@cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + +@test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 6f18eeb1..38d6dd3c 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -39,6 +39,7 @@ include(joinpath("device", "threading.jl")) include("compiler.jl") include("execution.jl") include("interrupts.jl") +include("gc.jl") include("reflection.jl") include("deprecated.jl") diff --git a/src/gc.jl b/src/gc.jl new file mode 100644 index 00000000..fb595e8e --- /dev/null +++ b/src/gc.jl @@ -0,0 +1,325 @@ +# This file contains a GC implementation for CUDAnative kernels. +# +# CURRENT STATE OF THE GC +# +# Simple memory allocation is underway. Memory allocation currently +# uses a simple free-list. +# +# END GOAL +# +# The CUDAnative GC is a precise, non-moving, mark-and-sweep GC that runs +# on the host. The device may trigger the GC via an interrupt. +# +# Some GPU-related GC implementation details: +# +# * GC memory is shared by the host and device. +# * Every thread gets a fixed region of memory for storing GC roots in. 
+# * When the device runs out of GC memory, it requests an interrupt
+# to mark and sweep.
+
+export @cuda_gc, gc_malloc
+
+# An entry in the GC's free list. Every entry is placed at the
+# start of a free memory chunk. The `next` pointer of a GC free
+# list entry is aligned to a 16-byte boundary.
+struct GCFreeListEntry
+    # The size of the entry. This size does not include the entry's
+    # `size` field, but it does include the `next` field.
+    size::Csize_t
+    # A pointer to the next entry in the free list.
+    next::Ptr{GCFreeListEntry}
+end
+
+@generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name}
+    index = Base.fieldindex(TBase, field_name)
+    offset = Base.fieldoffset(TBase, index)
+    type = Core.fieldtype(TBase, index)
+    :(Base.unsafe_convert(Ptr{$type}, base_pointer + $(offset)))
+end
+
+# Gets a pointer to a particular field.
+macro get_field_pointer(base_pointer, field_name)
+    :(get_field_pointer_impl($(esc(base_pointer)), Val($field_name)))
+end
+
+# A data structure that contains information relevant
+# to the GC's inner workings.
+struct GCMemoryInfo
+    # The head of the free list.
+    free_list_head::Ptr{GCFreeListEntry}
+end
+
+# Gets the global GC interrupt lock.
+@inline function get_interrupt_lock()::ReaderWriterLock
+    return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState))
+end
+
+# Gets a pointer to the global GC info data structure pointer.
+@inline function get_gc_info_pointer()::Ptr{Ptr{GCMemoryInfo}}
+    return @cuda_global_ptr("gc_info_pointer", Ptr{GCMemoryInfo})
+end
+
+const gc_align = Csize_t(16)
+
+# Aligns a pointer to an alignment boundary.
+function align_to_boundary(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + address_int = Base.convert(Csize_t, address) + remainder = address_int % alignment + if remainder == Csize_t(0) + return address + else + return address + alignment - remainder + end +end + +# Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. +# Updates the free list if the allocation succeeds. Returns a null pointer otherwise. +function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCFreeListEntry}}, entry::Ptr{GCFreeListEntry}, bytesize::Csize_t)::Ptr{UInt8} + entry_data = unsafe_load(entry) + if entry_data.size < bytesize + # The entry is just too small. Return a `null` pointer. + return C_NULL + end + + # The entry's big enough, so we'll use it. If at all possible, we want + # to create a new entry from any unused memory in the entry. + + # Compute the address to return. + data_address = Base.unsafe_convert(Ptr{UInt8}, entry) + sizeof(Csize_t) + + # Compute the end of the free memory chunk. + end_address = data_address + entry_data.size + + # Compute the start address of the new free list entry. The `next` + # field of that entry needs to be aligned to a 16-byte boundary, + # but the `size` field doesn't. + new_data_address = align_to_boundary(data_address + bytesize) + new_entry_address = new_data_address - sizeof(Csize_t) + if new_entry_address < data_address + bytesize + new_entry_address += gc_align + end + + # If we can place a new entry just past the allocation, then we should + # by all means do so. + if new_entry_address + sizeof(GCFreeListEntry) < end_address + # Create a new free list entry. + new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) + new_entry_ptr = Base.unsafe_convert(Ptr{GCFreeListEntry}, new_entry_address) + unsafe_store!( + new_entry_ptr, + GCFreeListEntry(new_entry_size, entry_data.next)) + + # Update this entry's `size` field to reflect the new entry's space + # requirements. 
+ unsafe_store!( + @get_field_pointer(entry, :size)::Ptr{Csize_t}, + entry_data.size - new_entry_size - sizeof(GCFreeListEntry)) + + # Update the free list pointer. + unsafe_store!(entry_ptr, new_entry_ptr) + else + # We can't create a new entry, but we still have to update the free + # list pointer. + unsafe_store!(entry_ptr, entry_data.next) + end + + return data_address +end + +# Tries to allocate a chunk of memory from a free list. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +# +# `free_list_ptr` is a pointer to the head of the free list. +# +# This function is not thread-safe. +function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCFreeListEntry}}, bytesize::Csize_t)::Ptr{UInt8} + # To allocate memory, we will walk the free list until we find a suitable candidate. + while free_list_ptr != C_NULL + free_list_item = unsafe_load(free_list_ptr) + + if free_list_item == C_NULL + break + end + + result = gc_use_free_list_entry(free_list_ptr, free_list_item, bytesize) + if result != C_NULL + return result + end + + free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{GCFreeListEntry}} + end + return C_NULL +end + +# Tries to allocate a chunk of memory. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UInt8} + # TODO: reader-lock on the interrupt lock and writer-lock on the GC's + # lock. + writer_locked(get_interrupt_lock()) do + free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCFreeListEntry}} + return gc_malloc_from_free_list(free_list_ptr, bytesize) + end +end + +# Allocates a blob of memory that is managed by the garbage collector. +# This function is designed to be called by the device. +function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} + gc_info = unsafe_load(get_gc_info_pointer()) + + # Try to malloc the object without host intervention. 
+ ptr = gc_malloc_local(gc_info, bytesize) + if ptr != C_NULL + return ptr + end + + # We're out of memory. Ask the host to step in. + writer_locked(get_interrupt_lock()) do + interrupt_or_wait() + end + + # Try to malloc again. + ptr = gc_malloc_local(gc_info, bytesize) + if ptr != C_NULL + return ptr + end + + # Alright, so that was a spectacular failure. Let's just throw an exception. + @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", bytesize) + # throw(OutOfMemoryError()) + return C_NULL +end + +# Set the initial size of the chunk of memory allocated to the +# GC to 16MiB. +const initial_gc_memory_size = 16 * (1 << 20) + +# Initializes GC memory. +function gc_init(buffer::Array{UInt8, 1}) + buffer_ptr = pointer(buffer, 1) + + # Create a single free list entry. + first_entry_ptr = Base.unsafe_convert(Ptr{GCFreeListEntry}, buffer_ptr + sizeof(GCMemoryInfo)) + unsafe_store!( + first_entry_ptr, + GCFreeListEntry( + length(buffer) - sizeof(Csize_t) - sizeof(GCMemoryInfo), + C_NULL)) + + # Set up the main GC data structure. + gc_info = Base.unsafe_convert(Ptr{GCMemoryInfo}, buffer_ptr) + unsafe_store!( + gc_info, + GCMemoryInfo(first_entry_ptr)) +end + +# Triggers a GC collection. +function gc_collect(info::Ptr{GCMemoryInfo}) + println("GC collections are not implemented yet.") +end + +""" + @cuda_gc [kwargs...] func(args...) + +High-level interface for executing code on a GPU with GC support. +The `@cuda_gc` macro should prefix a call, with `func` a callable function +or object that should return nothing. It will be compiled to a CUDA function upon first +use, and to a certain extent arguments will be converted and anaged automatically using +`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. + +Several keyword arguments are supported that influence kernel compilation and execution. 
For +more information, refer to the documentation of respectively [`cufunction`](@ref) and +[`CUDAnative.Kernel`](@ref). +""" +macro cuda_gc(ex...) + # destructure the `@cuda_gc` expression + if length(ex) > 0 && ex[1].head == :tuple + error("The tuple argument to @cuda has been replaced by keywords: `@cuda_gc threads=... fun(args...)`") + end + call = ex[end] + kwargs = ex[1:end-1] + + # destructure the kernel call + if call.head != :call + throw(ArgumentError("second argument to @cuda_gc should be a function call")) + end + f = call.args[1] + args = call.args[2:end] + + code = quote end + compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + vars, var_exprs = CUDAnative.assign_args!(code, args) + + # Find the stream on which the kernel is to be scheduled. + stream = CuDefaultStream() + for kwarg in call_kwargs + key, val = kwarg.args + if key == :stream + stream = val + end + end + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local host_interrupt_array, device_interrupt_buffer = alloc_shared_array((1,), ready) + + # Allocate a shared buffer for GC memory. + local host_gc_array, device_gc_buffer = alloc_shared_array((initial_gc_memory_size,), UInt8(0)) + gc_init(host_gc_array) + + # Define a kernel initialization function. + local function kernel_init(kernel) + # Set the interrupt state pointer. + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, CuPtr{UInt32}(device_interrupt_buffer.ptr)) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + + # Set the GC state pointer. 
+ try + global_handle = CuGlobal{CuPtr{GCMemoryInfo}}(kernel.mod, "gc_info_pointer") + set(global_handle, CuPtr{GCMemoryInfo}(device_gc_buffer.ptr)) + catch exception + # The GC info pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + local function handle_interrupt() + gc_collect(Ptr{GCMemoryInfo}(pointer(host_gc_array, 1))) + end + + try + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + finally + free_shared_array(device_interrupt_buffer) + free_shared_array(device_gc_buffer) + end + end + end) + return code +end diff --git a/src/interrupts.jl b/src/interrupts.jl index 03068387..333545bf 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -187,7 +187,7 @@ end """ @cuda_interruptible [kwargs...] func(args...) -High-level interface for executing code on a GPU with support for interrups. +High-level interface for executing code on a GPU with support for interrupts. The `@cuda_interruptible` macro should prefix a call, with `func` a callable function or object that should return nothing. 
It will be compiled to a CUDA function upon first use, and to a certain extent arguments will be converted and anaged automatically using From 279f6ff30a71146a40c48315b7141b3d959edc96 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 11:38:49 +0100 Subject: [PATCH 021/146] Rename 'GCFreeListEntry' to 'GCAllocationRecord' --- src/gc.jl | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index fb595e8e..65f7942b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -5,6 +5,13 @@ # Simple memory allocation is underway. Memory allocation currently # uses a simple free-list. # +# MEMORY ALLOCATION +# +# The GC's allocator uses free lists, i.e., the allocator maintains +# a list of all blocks that have not been allocated. Additionally, +# the allocator also maintains a list of all allocated blocks, so +# the collector knows which blocks it can free. +# # END GOAL # # The CUDAnative GC is a precise, non-moving, mark-and-sweep GC that runs @@ -19,15 +26,14 @@ export @cuda_gc, gc_malloc -# An entry in the GC's free list. Every entry is placed at the -# start of an free memory chunk. The `next` pointer of a GC free -# list entry is aligned to a 16-byte boundary. -struct GCFreeListEntry - # The size of the entry. This size does not include the entry's - # `size` field, but it does include the `next` field. +# A data structure that precedes every chunk of memory that has been +# allocated or put into the free list. +struct GCAllocationRecord + # The size of the memory region this allocation record precedes. + # This size does not include the allocation record itself. size::Csize_t # A pointer to the next entry in the free list. - next::Ptr{GCFreeListEntry} + next::Ptr{GCAllocationRecord} end @generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name} @@ -46,7 +52,7 @@ end # to the GC's inner workings. 
struct GCMemoryInfo # The head of the free list. - free_list_head::Ptr{GCFreeListEntry} + free_list_head::Ptr{GCAllocationRecord} end # Gets the global GC interrupt lock. @@ -74,7 +80,7 @@ end # Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. # Updates the free list if the allocation succeeds. Returns a null pointer otherwise. -function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCFreeListEntry}}, entry::Ptr{GCFreeListEntry}, bytesize::Csize_t)::Ptr{UInt8} +function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry::Ptr{GCAllocationRecord}, bytesize::Csize_t)::Ptr{UInt8} entry_data = unsafe_load(entry) if entry_data.size < bytesize # The entry is just too small. Return a `null` pointer. @@ -101,19 +107,19 @@ function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCFreeListEntry}}, entry::Ptr # If we can place a new entry just past the allocation, then we should # by all means do so. - if new_entry_address + sizeof(GCFreeListEntry) < end_address + if new_entry_address + sizeof(GCAllocationRecord) < end_address # Create a new free list entry. new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) - new_entry_ptr = Base.unsafe_convert(Ptr{GCFreeListEntry}, new_entry_address) + new_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, new_entry_address) unsafe_store!( new_entry_ptr, - GCFreeListEntry(new_entry_size, entry_data.next)) + GCAllocationRecord(new_entry_size, entry_data.next)) # Update this entry's `size` field to reflect the new entry's space # requirements. unsafe_store!( @get_field_pointer(entry, :size)::Ptr{Csize_t}, - entry_data.size - new_entry_size - sizeof(GCFreeListEntry)) + entry_data.size - new_entry_size - sizeof(GCAllocationRecord)) # Update the free list pointer. unsafe_store!(entry_ptr, new_entry_ptr) @@ -133,7 +139,7 @@ end # `free_list_ptr` is a pointer to the head of the free list. # # This function is not thread-safe. 
-function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCFreeListEntry}}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, bytesize::Csize_t)::Ptr{UInt8} # To allocate memory, we will walk the free list until we find a suitable candidate. while free_list_ptr != C_NULL free_list_item = unsafe_load(free_list_ptr) @@ -147,7 +153,7 @@ function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCFreeListEntry}}, byte return result end - free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{GCFreeListEntry}} + free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{GCAllocationRecord}} end return C_NULL end @@ -159,7 +165,7 @@ function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UIn # TODO: reader-lock on the interrupt lock and writer-lock on the GC's # lock. writer_locked(get_interrupt_lock()) do - free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCFreeListEntry}} + free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} return gc_malloc_from_free_list(free_list_ptr, bytesize) end end @@ -201,10 +207,10 @@ function gc_init(buffer::Array{UInt8, 1}) buffer_ptr = pointer(buffer, 1) # Create a single free list entry. 
- first_entry_ptr = Base.unsafe_convert(Ptr{GCFreeListEntry}, buffer_ptr + sizeof(GCMemoryInfo)) + first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, buffer_ptr + sizeof(GCMemoryInfo)) unsafe_store!( first_entry_ptr, - GCFreeListEntry( + GCAllocationRecord( length(buffer) - sizeof(Csize_t) - sizeof(GCMemoryInfo), C_NULL)) From c0c06e2882e64bbc11c265c982137880d7ed307a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 11:47:13 +0100 Subject: [PATCH 022/146] Avoid partially overwriting allocation records --- src/gc.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 65f7942b..e6939247 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -91,16 +91,16 @@ function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry:: # to create a new entry from any unused memory in the entry. # Compute the address to return. - data_address = Base.unsafe_convert(Ptr{UInt8}, entry) + sizeof(Csize_t) + data_address = Base.unsafe_convert(Ptr{UInt8}, entry) + sizeof(GCAllocationRecord) # Compute the end of the free memory chunk. end_address = data_address + entry_data.size - # Compute the start address of the new free list entry. The `next` - # field of that entry needs to be aligned to a 16-byte boundary, - # but the `size` field doesn't. + # Compute the start address of the new free list entry. The data + # prefixed by the block needs to be aligned to a 16-byte boundary, + # but the block itself doesn't. new_data_address = align_to_boundary(data_address + bytesize) - new_entry_address = new_data_address - sizeof(Csize_t) + new_entry_address = new_data_address - sizeof(GCAllocationRecord) if new_entry_address < data_address + bytesize new_entry_address += gc_align end @@ -119,7 +119,7 @@ function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry:: # requirements. 
unsafe_store!( @get_field_pointer(entry, :size)::Ptr{Csize_t}, - entry_data.size - new_entry_size - sizeof(GCAllocationRecord)) + Csize_t(new_entry_address) - Csize_t(data_address)) # Update the free list pointer. unsafe_store!(entry_ptr, new_entry_ptr) @@ -211,7 +211,7 @@ function gc_init(buffer::Array{UInt8, 1}) unsafe_store!( first_entry_ptr, GCAllocationRecord( - length(buffer) - sizeof(Csize_t) - sizeof(GCMemoryInfo), + length(buffer) - sizeof(GCAllocationRecord) - sizeof(GCMemoryInfo), C_NULL)) # Set up the main GC data structure. From 79dc0d434adf14175a92a715ec051fefb04a6f85 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 11:58:20 +0100 Subject: [PATCH 023/146] Refactor GC collection triggering logic --- src/gc.jl | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index e6939247..63e13a40 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -32,7 +32,11 @@ struct GCAllocationRecord # The size of the memory region this allocation record precedes. # This size does not include the allocation record itself. size::Csize_t - # A pointer to the next entry in the free list. + + # A pointer to the next allocation record in the list. If this + # allocation record is part of the free list, then this pointer + # points to the next free list entry; otherwise, it points to the + # next entry in the list of allocated blocks. next::Ptr{GCAllocationRecord} end @@ -170,8 +174,12 @@ function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UIn end end -# Allocates a blob of memory that is managed by the garbage collector. -# This function is designed to be called by the device. +""" + gc_malloc(bytesize::Csize_t)::Ptr{UInt8} + +Allocates a blob of memory that is managed by the garbage collector. +This function is designed to be called by the device. 
+""" function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} gc_info = unsafe_load(get_gc_info_pointer()) @@ -182,9 +190,7 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} end # We're out of memory. Ask the host to step in. - writer_locked(get_interrupt_lock()) do - interrupt_or_wait() - end + gc_collect() # Try to malloc again. ptr = gc_malloc_local(gc_info, bytesize) @@ -198,6 +204,19 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} return C_NULL end +""" + gc_collect() + +Triggers a garbage collection phase. This function is designed +to be called by the device rather than by the host. +""" +function gc_collect() + writer_locked(get_interrupt_lock()) do + interrupt_or_wait() + threadfence_system() + end +end + # Set the initial size of the chunk of memory allocated to the # GC to 16MiB. const initial_gc_memory_size = 16 * (1 << 20) @@ -221,8 +240,9 @@ function gc_init(buffer::Array{UInt8, 1}) GCMemoryInfo(first_entry_ptr)) end -# Triggers a GC collection. -function gc_collect(info::Ptr{GCMemoryInfo}) +# Collects garbage. This function is designed to be called by +# the host, not by the device. +function gc_collect_impl(info::Ptr{GCMemoryInfo}) println("GC collections are not implemented yet.") end @@ -308,7 +328,7 @@ macro cuda_gc(ex...) end local function handle_interrupt() - gc_collect(Ptr{GCMemoryInfo}(pointer(host_gc_array, 1))) + gc_collect_impl(Ptr{GCMemoryInfo}(pointer(host_gc_array, 1))) end try From 563c3c052827e3f1edd33813c60c8ec232ab1ea7 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 12:08:25 +0100 Subject: [PATCH 024/146] Have the GC maintain a list of allocated blocks --- src/gc.jl | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 63e13a40..77dedbaf 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -57,6 +57,9 @@ end struct GCMemoryInfo # The head of the free list. free_list_head::Ptr{GCAllocationRecord} + + # The head of the allocation list. 
+ allocation_list_head::Ptr{GCAllocationRecord} end # Gets the global GC interrupt lock. @@ -84,7 +87,12 @@ end # Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. # Updates the free list if the allocation succeeds. Returns a null pointer otherwise. -function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry::Ptr{GCAllocationRecord}, bytesize::Csize_t)::Ptr{UInt8} +function gc_use_free_list_entry( + entry_ptr::Ptr{Ptr{GCAllocationRecord}}, + allocation_list_ptr::Ptr{Ptr{GCAllocationRecord}}, + entry::Ptr{GCAllocationRecord}, + bytesize::Csize_t,)::Ptr{UInt8} + entry_data = unsafe_load(entry) if entry_data.size < bytesize # The entry is just too small. Return a `null` pointer. @@ -133,6 +141,18 @@ function gc_use_free_list_entry(entry_ptr::Ptr{Ptr{GCAllocationRecord}}, entry:: unsafe_store!(entry_ptr, entry_data.next) end + # At this point, all we need to do is update the allocation record to + # reflect the fact that it now represents an allocated block instead of + # a free block. + + # Set the `next` pointer to the value stored at the allocation list pointer. + unsafe_store!( + @get_field_pointer(entry, :next)::Ptr{Ptr{GCAllocationRecord}}, + unsafe_load(allocation_list_ptr)) + + # Update the allocation list pointer to point to the entry. + unsafe_store!(allocation_list_ptr, entry) + return data_address end @@ -141,9 +161,13 @@ end # memory can be found. # # `free_list_ptr` is a pointer to the head of the free list. +# `allocation_list_ptr` is a pointer to the head of the allocation list. # # This function is not thread-safe. -function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_from_free_list( + free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, + allocation_list_ptr::Ptr{Ptr{GCAllocationRecord}}, + bytesize::Csize_t)::Ptr{UInt8} # To allocate memory, we will walk the free list until we find a suitable candidate. 
while free_list_ptr != C_NULL free_list_item = unsafe_load(free_list_ptr) @@ -152,7 +176,7 @@ function gc_malloc_from_free_list(free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, b break end - result = gc_use_free_list_entry(free_list_ptr, free_list_item, bytesize) + result = gc_use_free_list_entry(free_list_ptr, allocation_list_ptr, free_list_item, bytesize) if result != C_NULL return result end @@ -170,7 +194,8 @@ function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UIn # lock. writer_locked(get_interrupt_lock()) do free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - return gc_malloc_from_free_list(free_list_ptr, bytesize) + allocation_list_ptr = @get_field_pointer(gc_info, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + return gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) end end @@ -237,7 +262,7 @@ function gc_init(buffer::Array{UInt8, 1}) gc_info = Base.unsafe_convert(Ptr{GCMemoryInfo}, buffer_ptr) unsafe_store!( gc_info, - GCMemoryInfo(first_entry_ptr)) + GCMemoryInfo(first_entry_ptr, C_NULL)) end # Collects garbage. This function is designed to be called by From 4ffc62febe59eb55e517b44d3a1c33942f262cfd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 12:35:48 +0100 Subject: [PATCH 025/146] Introduce the notion of a GC master record --- src/gc.jl | 63 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 77dedbaf..8496fd1d 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -52,9 +52,10 @@ macro get_field_pointer(base_pointer, field_name) :(get_field_pointer_impl($(esc(base_pointer)), Val($field_name))) end -# A data structure that contains information relevant -# to the GC's inner workings. -struct GCMemoryInfo +# A data structure that describes a single GC "arena", i.e., +# a section of the heap that is managed by the GC. 
Every arena +# has its own free list and allocation list. +struct GCArenaRecord # The head of the free list. free_list_head::Ptr{GCAllocationRecord} @@ -62,14 +63,22 @@ struct GCMemoryInfo allocation_list_head::Ptr{GCAllocationRecord} end +# A data structure that contains global GC info. This data +# structure is designed to be immutable: it should not be changed +# once the host has set it up. +struct GCMasterRecord + # A pointer to the global GC arena. + global_arena::Ptr{GCArenaRecord} +end + # Gets the global GC interrupt lock. @inline function get_interrupt_lock()::ReaderWriterLock return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState)) end -# Gets a pointer to the global GC info data structure pointer. -@inline function get_gc_info_pointer()::Ptr{Ptr{GCMemoryInfo}} - return @cuda_global_ptr("gc_info_pointer", Ptr{GCMemoryInfo}) +# Gets a pointer to the GC master record. +@inline function get_gc_master_record()::Ptr{GCMasterRecord} + return @cuda_global_ptr("gc_master_record", GCMasterRecord) end const gc_align = Csize_t(16) @@ -186,15 +195,15 @@ function gc_malloc_from_free_list( return C_NULL end -# Tries to allocate a chunk of memory. +# Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -function gc_malloc_local(gc_info::Ptr{GCMemoryInfo}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt8} # TODO: reader-lock on the interrupt lock and writer-lock on the GC's # lock. 
writer_locked(get_interrupt_lock()) do - free_list_ptr = @get_field_pointer(gc_info, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - allocation_list_ptr = @get_field_pointer(gc_info, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} return gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) end end @@ -206,10 +215,10 @@ Allocates a blob of memory that is managed by the garbage collector. This function is designed to be called by the device. """ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} - gc_info = unsafe_load(get_gc_info_pointer()) + master_record = unsafe_load(get_gc_master_record()) # Try to malloc the object without host intervention. - ptr = gc_malloc_local(gc_info, bytesize) + ptr = gc_malloc_local(master_record.global_arena, bytesize) if ptr != C_NULL return ptr end @@ -218,7 +227,7 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} gc_collect() # Try to malloc again. - ptr = gc_malloc_local(gc_info, bytesize) + ptr = gc_malloc_local(master_record.global_arena, bytesize) if ptr != C_NULL return ptr end @@ -246,28 +255,30 @@ end # GC to 16MiB. const initial_gc_memory_size = 16 * (1 << 20) -# Initializes GC memory. -function gc_init(buffer::Array{UInt8, 1}) +# Initializes GC memory and produces a master record. +function gc_init(buffer::Array{UInt8, 1})::GCMasterRecord buffer_ptr = pointer(buffer, 1) # Create a single free list entry. 
- first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, buffer_ptr + sizeof(GCMemoryInfo)) + first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, buffer_ptr + sizeof(GCArenaRecord)) unsafe_store!( first_entry_ptr, GCAllocationRecord( - length(buffer) - sizeof(GCAllocationRecord) - sizeof(GCMemoryInfo), + length(buffer) - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord), C_NULL)) # Set up the main GC data structure. - gc_info = Base.unsafe_convert(Ptr{GCMemoryInfo}, buffer_ptr) + global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, buffer_ptr) unsafe_store!( - gc_info, - GCMemoryInfo(first_entry_ptr, C_NULL)) + global_arena, + GCArenaRecord(first_entry_ptr, C_NULL)) + + return GCMasterRecord(global_arena) end # Collects garbage. This function is designed to be called by # the host, not by the device. -function gc_collect_impl(info::Ptr{GCMemoryInfo}) +function gc_collect_impl(master_record::GCMasterRecord) println("GC collections are not implemented yet.") end @@ -323,7 +334,7 @@ macro cuda_gc(ex...) # Allocate a shared buffer for GC memory. local host_gc_array, device_gc_buffer = alloc_shared_array((initial_gc_memory_size,), UInt8(0)) - gc_init(host_gc_array) + local master_record = gc_init(host_gc_array) # Define a kernel initialization function. local function kernel_init(kernel) @@ -339,10 +350,10 @@ macro cuda_gc(ex...) end end - # Set the GC state pointer. + # Set the GC master record. try - global_handle = CuGlobal{CuPtr{GCMemoryInfo}}(kernel.mod, "gc_info_pointer") - set(global_handle, CuPtr{GCMemoryInfo}(device_gc_buffer.ptr)) + global_handle = CuGlobal{GCMasterRecord}(kernel.mod, "gc_master_record") + set(global_handle, master_record) catch exception # The GC info pointer may not have been declared (because it is unused). # In that case, we should do nothing. @@ -353,7 +364,7 @@ macro cuda_gc(ex...) 
end local function handle_interrupt() - gc_collect_impl(Ptr{GCMemoryInfo}(pointer(host_gc_array, 1))) + gc_collect_impl(master_record) end try From 454a6ef134a71fa15dd7947a7374d2c2c4dcdfa4 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 12:54:29 +0100 Subject: [PATCH 026/146] Reserve GC memory for GC frames --- src/gc.jl | 67 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 8496fd1d..b549bad8 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -63,12 +63,21 @@ struct GCArenaRecord allocation_list_head::Ptr{GCAllocationRecord} end +# A reference to a Julia object. +const ObjectRef = Ptr{Nothing} + # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed # once the host has set it up. struct GCMasterRecord # A pointer to the global GC arena. global_arena::Ptr{GCArenaRecord} + + # The size of a GC root buffer. + root_buffer_size::Csize_t + + # A pointer to a list of buffers that can be used to store GC roots in. + root_buffers::Ptr{ObjectRef} end # Gets the global GC interrupt lock. @@ -251,29 +260,37 @@ function gc_collect() end end -# Set the initial size of the chunk of memory allocated to the -# GC to 16MiB. -const initial_gc_memory_size = 16 * (1 << 20) +# The initial size of the GC heap, currently 16 MiB. +const initial_gc_heap_size = 16 * (1 << 20) + +# The default size of a root buffer, i.e., the max number of +# roots that can be stored per thread. Currently set to +# 256 roots. That's 2 KiB of roots per thread. +const default_root_buffer_size = 256 # Initializes GC memory and produces a master record. -function gc_init(buffer::Array{UInt8, 1})::GCMasterRecord - buffer_ptr = pointer(buffer, 1) +function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_size::Integer = default_root_buffer_size)::GCMasterRecord + # Compute the total size of all root buffers. 
+ total_root_buffer_size = sizeof(ObjectRef) * default_root_buffer_size * thread_count + root_buffer_ptr = Base.unsafe_convert(Ptr{ObjectRef}, pointer(buffer, 1)) + + # Compute a pointer to the start of the heap. + heap_start_ptr = pointer(buffer, total_root_buffer_size + 1) + global_arena_size = length(buffer) - total_root_buffer_size - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord) # Create a single free list entry. - first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, buffer_ptr + sizeof(GCArenaRecord)) + first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, heap_start_ptr + sizeof(GCArenaRecord)) unsafe_store!( first_entry_ptr, - GCAllocationRecord( - length(buffer) - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord), - C_NULL)) + GCAllocationRecord(global_arena_size, C_NULL)) # Set up the main GC data structure. - global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, buffer_ptr) + global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, heap_start_ptr) unsafe_store!( global_arena, GCArenaRecord(first_entry_ptr, C_NULL)) - return GCMasterRecord(global_arena) + return GCMasterRecord(global_arena, root_buffer_size, root_buffer_ptr) end # Collects garbage. This function is designed to be called by @@ -282,6 +299,18 @@ function gc_collect_impl(master_record::GCMasterRecord) println("GC collections are not implemented yet.") end +# Examines a keyword argument list and gets either the value +# assigned to a key or a default value. +function get_kwarg_or_default(kwarg_list, key::Symbol, default) + for kwarg in kwarg_list + arg_key, val = kwarg.args + if arg_key == key + return val + end + end + return default +end + """ @cuda_gc [kwargs...] func(args...) @@ -316,13 +345,10 @@ macro cuda_gc(ex...) vars, var_exprs = CUDAnative.assign_args!(code, args) # Find the stream on which the kernel is to be scheduled. 
- stream = CuDefaultStream() - for kwarg in call_kwargs - key, val = kwarg.args - if key == :stream - stream = val - end - end + stream = get_kwarg_or_default(call_kwargs, :stream, CuDefaultStream()) + + # Get the total number of threads. + thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) # convert the arguments, call the compiler and launch the kernel # while keeping the original arguments alive @@ -333,8 +359,9 @@ macro cuda_gc(ex...) local host_interrupt_array, device_interrupt_buffer = alloc_shared_array((1,), ready) # Allocate a shared buffer for GC memory. - local host_gc_array, device_gc_buffer = alloc_shared_array((initial_gc_memory_size,), UInt8(0)) - local master_record = gc_init(host_gc_array) + local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_size * $(esc(thread_count)) + local host_gc_array, device_gc_buffer = alloc_shared_array((gc_memory_size,), UInt8(0)) + local master_record = gc_init(host_gc_array, $(esc(thread_count))) # Define a kernel initialization function. local function kernel_init(kernel) From 24c184fe6bc5e35cd902740739fa5c563b9d76ca Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 13:02:38 +0100 Subject: [PATCH 027/146] Have the GC allocate memory for root buffer sizes --- src/gc.jl | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index b549bad8..013f29e7 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -73,10 +73,16 @@ struct GCMasterRecord # A pointer to the global GC arena. global_arena::Ptr{GCArenaRecord} - # The size of a GC root buffer. - root_buffer_size::Csize_t + # The maximum size of a GC root buffer, i.e., the maximum number + # of roots per thread. + root_buffer_capacity::Csize_t + + # A pointer to a buffer that describes the number of elements + # currently in each root buffer. + root_buffer_sizes::Ptr{Csize_t} # A pointer to a list of buffers that can be used to store GC roots in. 
+ # These root buffers are partitioned into GC frames later on. root_buffers::Ptr{ObjectRef} end @@ -263,20 +269,25 @@ end # The initial size of the GC heap, currently 16 MiB. const initial_gc_heap_size = 16 * (1 << 20) -# The default size of a root buffer, i.e., the max number of +# The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to # 256 roots. That's 2 KiB of roots per thread. -const default_root_buffer_size = 256 +const default_root_buffer_capacity = 256 # Initializes GC memory and produces a master record. -function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_size::Integer = default_root_buffer_size)::GCMasterRecord - # Compute the total size of all root buffers. - total_root_buffer_size = sizeof(ObjectRef) * default_root_buffer_size * thread_count - root_buffer_ptr = Base.unsafe_convert(Ptr{ObjectRef}, pointer(buffer, 1)) +function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord + gc_memory_start_ptr = pointer(buffer, 1) + gc_memory_end_ptr = pointer(buffer, length(buffer)) + + # Set up root buffers. + sizebuf_bytesize = sizeof(Csize_t) * thread_count + sizebuf_ptr = gc_memory_start_ptr + rootbuf_bytesize = sizeof(ObjectRef) * default_root_buffer_capacity * thread_count + rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, sizebuf_ptr + sizebuf_bytesize) # Compute a pointer to the start of the heap. - heap_start_ptr = pointer(buffer, total_root_buffer_size + 1) - global_arena_size = length(buffer) - total_root_buffer_size - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord) + heap_start_ptr = rootbuf_ptr + rootbuf_bytesize + global_arena_size = Csize_t(gc_memory_end_ptr) - Csize_t(heap_start_ptr) - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord) # Create a single free list entry. 
first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, heap_start_ptr + sizeof(GCArenaRecord)) @@ -290,7 +301,7 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_siz global_arena, GCArenaRecord(first_entry_ptr, C_NULL)) - return GCMasterRecord(global_arena, root_buffer_size, root_buffer_ptr) + return GCMasterRecord(global_arena, root_buffer_capacity, sizebuf_ptr, rootbuf_ptr) end # Collects garbage. This function is designed to be called by @@ -359,7 +370,7 @@ macro cuda_gc(ex...) local host_interrupt_array, device_interrupt_buffer = alloc_shared_array((1,), ready) # Allocate a shared buffer for GC memory. - local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_size * $(esc(thread_count)) + local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_capacity * $(esc(thread_count)) local host_gc_array, device_gc_buffer = alloc_shared_array((gc_memory_size,), UInt8(0)) local master_record = gc_init(host_gc_array, $(esc(thread_count))) From 33e54b796914411fa1bfe772404bb6ac03287b0f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 13:17:30 +0100 Subject: [PATCH 028/146] Use 32-bit integers to describe GC root buffer sizes --- src/gc.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 013f29e7..077fd5c2 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -24,7 +24,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. -export @cuda_gc, gc_malloc +export @cuda_gc, gc_malloc, gc_collect # A data structure that precedes every chunk of memory that has been # allocated or put into the free list. @@ -75,11 +75,11 @@ struct GCMasterRecord # The maximum size of a GC root buffer, i.e., the maximum number # of roots per thread. - root_buffer_capacity::Csize_t + root_buffer_capacity::UInt32 # A pointer to a buffer that describes the number of elements # currently in each root buffer. 
- root_buffer_sizes::Ptr{Csize_t} + root_buffer_sizes::Ptr{UInt32} # A pointer to a list of buffers that can be used to store GC roots in. # These root buffers are partitioned into GC frames later on. @@ -280,7 +280,7 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap gc_memory_end_ptr = pointer(buffer, length(buffer)) # Set up root buffers. - sizebuf_bytesize = sizeof(Csize_t) * thread_count + sizebuf_bytesize = sizeof(Int32) * thread_count sizebuf_ptr = gc_memory_start_ptr rootbuf_bytesize = sizeof(ObjectRef) * default_root_buffer_capacity * thread_count rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, sizebuf_ptr + sizebuf_bytesize) From 71aa78f0d55e90c59ab0a3cd440f747f2a9574bc Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 13:41:17 +0100 Subject: [PATCH 029/146] Define GC frame management functions --- src/gc.jl | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 4 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 077fd5c2..0a1fcf78 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -91,9 +91,94 @@ end return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState)) end -# Gets a pointer to the GC master record. -@inline function get_gc_master_record()::Ptr{GCMasterRecord} - return @cuda_global_ptr("gc_master_record", GCMasterRecord) +# Runs a function in such a way that no collection phases will +# run as long as the function is executing. Use with care: this +# function acquires the GC interrupt lock in reader mode, so careless +# use may cause deadlocks. +@inline function nocollect(func::Function) + return reader_locked(func, get_interrupt_lock()) +end + +# Gets the GC master record. +@inline function get_gc_master_record()::GCMasterRecord + return unsafe_load(@cuda_global_ptr("gc_master_record", GCMasterRecord)) +end + +# Gets the thread ID of the current thread. 
+@inline function get_thread_id() + return threadIdx().x +end + +# Gets a pointer to the first element in the root buffer for this thread. +@inline function get_root_buffer_start()::Ptr{ObjectRef} + master_record = get_gc_master_record() + offset = master_record.root_buffer_capacity * get_thread_id() + return master_record.root_buffers + offset * sizeof(ObjectRef) +end + +""" + new_gc_frame(size::UInt32)::Ptr{ObjectRef} + +Allocates a new GC frame. +""" +function new_gc_frame(size::UInt32)::Ptr{ObjectRef} + nocollect() do + master_record = get_gc_master_record() + + # Get the current size of the root buffer. + current_size = unsafe_load( + master_record.root_buffer_sizes, + get_thread_id()) + + # The size of a root buffer should never exceed its capacity. + @cuassert(current_size + size <= master_record.root_buffer_capacity) + + return get_root_buffer_start() + current_size * sizeof(ObjectRef) + end +end + +""" + push_gc_frame(size::UInt32) + +Registers a GC frame with the garbage collector. +""" +function push_gc_frame(size::UInt32) + nocollect() do + master_record = get_gc_master_record() + + # Get the current size of the root buffer. + current_size = unsafe_load( + master_record.root_buffer_sizes, + get_thread_id()) + + # Add the new size to the current root buffer size. + unsafe_store!( + master_record.root_buffer_sizes, + current_size + size, + get_thread_id()) + end +end + +""" + pop_gc_frame(size::UInt32) + +Deregisters a GC frame. +""" +function pop_gc_frame(size::UInt32) + nocollect() do + master_record = get_gc_master_record() + + # Get the current size of the root buffer. + current_size = unsafe_load( + master_record.root_buffer_sizes, + get_thread_id()) + + # Subtract the size from the current root buffer size. + unsafe_store!( + master_record.root_buffer_sizes, + current_size - size, + get_thread_id()) + end end const gc_align = Csize_t(16) @@ -230,7 +315,7 @@ Allocates a blob of memory that is managed by the garbage collector. 
This function is designed to be called by the device. """ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} - master_record = unsafe_load(get_gc_master_record()) + master_record = get_gc_master_record() # Try to malloc the object without host intervention. ptr = gc_malloc_local(master_record.global_arena, bytesize) From da046af042903d6d9151086b83ce6a58391f9905 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 17:42:59 +0100 Subject: [PATCH 030/146] Make globals created by 'get_global_pointer' 'linkonce_odr' --- src/interrupts.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/interrupts.jl b/src/interrupts.jl index 333545bf..d70c2773 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -63,15 +63,16 @@ end mod = LLVM.parent(llvm_f) # Figure out if the global has been defined already. - globalSet = LLVM.globals(mod) + global_set = LLVM.globals(mod) global_name_string = String(global_name) - if haskey(globalSet, global_name_string) - global_var = globalSet[global_name_string] + if haskey(global_set, global_name_string) + global_var = global_set[global_name_string] else # If the global hasn't been defined already, then we'll define # it in the global address space, i.e., address space one. global_var = GlobalVariable(mod, T_global, global_name_string, 1) - LLVM.initializer!(global_var, LLVM.null(T_global)) + linkage!(global_var, LLVM.API.LLVMLinkOnceODRLinkage) + initializer!(global_var, LLVM.null(T_global)) end # Generate IR that computes the global's address. 
From 159acd384e46d561634de5c6fb0e087c200ce0ed Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 17:43:19 +0100 Subject: [PATCH 031/146] Protect newly allocated objects from collection --- src/gc.jl | 72 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 0a1fcf78..9c425b09 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -56,6 +56,9 @@ end # a section of the heap that is managed by the GC. Every arena # has its own free list and allocation list. struct GCArenaRecord + # The allocation lock for the arena. + lock_state::ReaderWriterLockState + # The head of the free list. free_list_head::Ptr{GCAllocationRecord} @@ -93,10 +96,16 @@ end # Runs a function in such a way that no collection phases will # run as long as the function is executing. Use with care: this -# function acquires the GC interrupt lock in reader mode, so careless +# macro acquires the GC interrupt lock in reader mode, so careless # use may cause deadlocks. -@inline function nocollect(func::Function) - return reader_locked(func, get_interrupt_lock()) +macro nocollect(func) + quote + local @inline function lock_callback() + $(esc(func)) + end + + reader_locked(lock_callback, get_interrupt_lock()) + end end # Gets the GC master record. @@ -116,25 +125,25 @@ end return master_record.root_buffers + offset * sizeof(ObjectRef) end +# Same as 'new_gc_frame_impl', but does not disable collections. +function new_gc_frame_impl(size::UInt32)::Ptr{ObjectRef} + master_record = get_gc_master_record() + + # Get the current size of the root buffer. + current_size = unsafe_load( + master_record.root_buffer_sizes, + get_thread_id()) + + return get_root_buffer_start() + current_size * sizeof(ObjectRef) +end + """ new_gc_frame(size::UInt32)::Ptr{ObjectRef} Allocates a new GC frame. 
""" function new_gc_frame(size::UInt32)::Ptr{ObjectRef} - nocollect() do - master_record = get_gc_master_record() - - # Get the current size of the root buffer. - current_size = unsafe_load( - master_record.root_buffer_sizes, - get_thread_id()) - - # The size of a root buffer should never exceed its capacity. - @cuassert(current_size + size <= master_record.root_buffer_capacity) - - return get_root_buffer_start() + current_size * sizeof(ObjectRef) - end + @nocollect new_gc_frame_impl(size) end """ @@ -143,7 +152,7 @@ end Registers a GC frame with the garbage collector. """ function push_gc_frame(size::UInt32) - nocollect() do + @nocollect begin master_record = get_gc_master_record() # Get the current size of the root buffer. @@ -165,7 +174,7 @@ end Deregisters a GC frame. """ function pop_gc_frame(size::UInt32) - nocollect() do + @nocollect begin master_record = get_gc_master_record() # Get the current size of the root buffer. @@ -299,12 +308,25 @@ end # Returns a null pointer if no sufficiently large chunk of # memory can be found. function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt8} - # TODO: reader-lock on the interrupt lock and writer-lock on the GC's - # lock. - writer_locked(get_interrupt_lock()) do - free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} - return gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + # Disable collections and acquire the arena's lock. + @nocollect begin + arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) + result_ptr = writer_locked(arena_lock) do + # Allocate a suitable region of memory. 
+ free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + end + + # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. + # Our reasoning for doing this is that doing so ensures that the allocated memory + # won't get collected by the GC before the caller has a chance to add it to its + # own GC frame. + if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) + gc_frame = new_gc_frame_impl(UInt32(1)) + unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) + end + return result_ptr end end @@ -384,7 +406,7 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, heap_start_ptr) unsafe_store!( global_arena, - GCArenaRecord(first_entry_ptr, C_NULL)) + GCArenaRecord(0, first_entry_ptr, C_NULL)) return GCMasterRecord(global_arena, root_buffer_capacity, sizebuf_ptr, rootbuf_ptr) end From 2b772287fa1785f5e12648bfe5a3a31269283541 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 17:54:17 +0100 Subject: [PATCH 032/146] Introduce a separate GPU GC lowering pass --- src/compiler/common.jl | 10 +++- src/compiler/optim.jl | 123 +++++++++++++++++++++++++++++++++++++++-- src/gc.jl | 2 +- 3 files changed, 128 insertions(+), 7 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index b9160a5f..33232b82 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -12,11 +12,17 @@ struct CompilerJob maxthreads::Union{Nothing,CuDim} blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} + # Indicates whether the GPU GC or the "malloc never free" + # GC intrinsic lowering strategy is to be used. The former + # is used when this field is `true`; the latter when it is + # `false`. 
+ gc::Bool CompilerJob(f, tt, cap, kernel; minthreads=nothing, maxthreads=nothing, - blocks_per_sm=nothing, maxregs=nothing) = - new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs) + blocks_per_sm=nothing, maxregs=nothing, + gc=false) = + new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, gc) end # global job reference diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 55b16ca6..70903a86 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -67,6 +67,14 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) # PTX-specific optimizations ModulePassManager() do pm initialize!(pm) + # lower intrinsics + if ctx.gc + add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) + else + add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) + end + aggressive_dce!(pm) # remove dead uses of ptls + add!(pm, ModulePass("LowerPTLS", lower_ptls!)) # NVPTX's target machine info enables runtime unrolling, # but Julia's pass sequence only invokes the simple unroller. @@ -379,10 +387,117 @@ function eager_lower_gc_frame!(fun::LLVM.Function) return changed end -# Lowers the GC intrinsics produced by the LateLowerGCFrame pass. These -# intrinsics are the last point at which we can intervene in the pipeline -# before the passes that deal with them become CPU-specific. -function lower_final_gc_intrinsics!(mod::LLVM.Module) +# Lowers the GC intrinsics produced by the LateLowerGCFrame pass to +# use the "malloc, never free" strategy. These intrinsics are the +# last point at which we can intervene in the pipeline before the +# passes that deal with them become CPU-specific. +function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) + changed = false + + # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates + # store for an object, including headroom, but does not set the object's + # tag. 
+ visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes + gc_alloc_bytes_ft = eltype(llvmtype(gc_alloc_bytes))::LLVM.FunctionType + T_ret = return_type(gc_alloc_bytes_ft)::LLVM.PointerType + T_bitcast = LLVM.PointerType(T_ret, LLVM.addrspace(T_ret)) + + # Decode the call. + ops = collect(operands(call)) + size = ops[2] + + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) + cast_ptr = bitcast!(builder, ptr, T_bitcast) + bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) + replace_uses!(call, bumped_ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # Next up: 'julia.new_gc_frame'. This intrinsic allocates a new GC frame. + # We'll lower it as an alloca and hope SSA construction and DCE passes + # get rid of the alloca. This is a reasonable thing to hope for because + # all intrinsics that may cause the GC frame to escape will be replaced by + # nops. + visit_calls_to("julia.new_gc_frame", mod) do call, new_gc_frame + new_gc_frame_ft = eltype(llvmtype(new_gc_frame))::LLVM.FunctionType + T_ret = return_type(new_gc_frame_ft)::LLVM.PointerType + T_alloca = eltype(T_ret) + + # Decode the call. + ops = collect(operands(call)) + size = ops[1] + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. 
+ let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = array_alloca!(builder, T_alloca, size) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.get_gc_frame_slot' is closely related to the previous + # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of + # a slot in the GC frame. We can simply turn this intrinsic into a GEP. + visit_calls_to("julia.get_gc_frame_slot", mod) do call, _ + # Decode the call. + ops = collect(operands(call)) + frame = ops[1] + offset = ops[2] + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = gep!(builder, frame, [offset]) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.push_gc_frame' registers a GC frame with the GC. We + # don't have a GC, so we can just delete calls to this intrinsic! + changed |= delete_calls_to!("julia.push_gc_frame", mod) + + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so + # we can just delete calls to this intrinsic, too. + changed |= delete_calls_to!("julia.pop_gc_frame", mod) + + # Ditto for 'julia.queue_gc_root'. + changed |= delete_calls_to!("julia.queue_gc_root", mod) + + return changed +end + +""" +lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) + +An LLVM pass that lowers the GC intrinsics produced by the +LateLowerGCFrame pass to use the GPU GC. These intrinsics are the +last point at which we can intervene in the pipeline before the +passes that deal with them become CPU-specific. +""" +function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) changed = false # We'll start off with 'julia.gc_alloc_bytes'. 
This intrinsic allocates diff --git a/src/gc.jl b/src/gc.jl index 9c425b09..32de5311 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -516,7 +516,7 @@ macro cuda_gc(ex...) # Standard kernel setup logic. local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, $(map(esc, compiler_kwargs)...)) CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) kernel(kernel_args...; $(map(esc, call_kwargs)...)) From 9a3da04a4cd6347510b9b15bf2a459c304e28f5d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 18:28:44 +0100 Subject: [PATCH 033/146] Use 'gc_malloc' instead of regular 'malloc' when in GC mode --- src/compiler/optim.jl | 8 +++++--- src/device/runtime.jl | 11 +++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 70903a86..ef42d155 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -418,7 +418,8 @@ function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) cast_ptr = bitcast!(builder, ptr, T_bitcast) bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) - replace_uses!(call, bumped_ptr) + result_ptr = bitcast!(builder, bumped_ptr, T_ret) + replace_uses!(call, result_ptr) unsafe_delete!(LLVM.parent(call), call) dispose(builder) end @@ -521,10 +522,11 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) let builder = Builder(JuliaContext()) position!(builder, call) total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) - ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) + ptr = call!(builder, Runtime.get(:gc_malloc_object), [total_size]) cast_ptr = bitcast!(builder, ptr, T_bitcast) bumped_ptr = 
gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) - replace_uses!(call, bumped_ptr) + result_ptr = bitcast!(builder, bumped_ptr, T_ret) + replace_uses!(call, result_ptr) unsafe_delete!(LLVM.parent(call), call) dispose(builder) end diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 1bf9fa5e..a331c9ee 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -225,5 +225,16 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end +""" + gc_malloc_object(bytesize::Csize_t) + +Allocates an object that is managed by the garbage collector. +This function is designed to be called by the device. +""" +function gc_malloc_object(bytesize::Csize_t) + return unsafe_pointer_to_objref(gc_malloc(bytesize)) +end + +compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) end From 5bd8da4d10496b9057f0cfdeb0973aaa85a63b3f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 18:43:02 +0100 Subject: [PATCH 034/146] Use pointers instead of integers to keep track of GC frames --- src/gc.jl | 71 ++++++++++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 43 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 32de5311..660614c3 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -80,9 +80,9 @@ struct GCMasterRecord # of roots per thread. root_buffer_capacity::UInt32 - # A pointer to a buffer that describes the number of elements - # currently in each root buffer. - root_buffer_sizes::Ptr{UInt32} + # A pointer to a list of root buffer pointers that point to the + # end of the root buffer for every thread. + root_buffer_fingers::Ptr{Ptr{ObjectRef}} # A pointer to a list of buffers that can be used to store GC roots in. # These root buffers are partitioned into GC frames later on. @@ -118,23 +118,13 @@ end return threadIdx().x end -# Gets a pointer to the first element in the root buffer for this thread. 
-@inline function get_root_buffer_start()::Ptr{ObjectRef} - master_record = get_gc_master_record() - offset = master_record.root_buffer_capacity * get_thread_id() - return master_record.root_buffers + offset * sizeof(ObjectRef) -end +const GCFrame = Ptr{ObjectRef} # Same as 'new_gc_frame_impl', but does not disable collections. -function new_gc_frame_impl(size::UInt32)::Ptr{ObjectRef} +function new_gc_frame_impl(size::UInt32)::GCFrame master_record = get_gc_master_record() - - # Get the current size of the root buffer. - current_size = unsafe_load( - master_record.root_buffer_sizes, - get_thread_id()) - - return get_root_buffer_start() + current_size * sizeof(ObjectRef) + # Return the root buffer tip: that's where the new GC frame starts. + return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) end """ @@ -142,50 +132,40 @@ end Allocates a new GC frame. """ -function new_gc_frame(size::UInt32)::Ptr{ObjectRef} +function new_gc_frame(size::UInt32)::GCFrame @nocollect new_gc_frame_impl(size) end """ - push_gc_frame(size::UInt32) + push_gc_frame(gc_frame::GCFrame, size::UInt32) Registers a GC frame with the garbage collector. """ -function push_gc_frame(size::UInt32) +function push_gc_frame(gc_frame::GCFrame, size::UInt32) @nocollect begin master_record = get_gc_master_record() - # Get the current size of the root buffer. - current_size = unsafe_load( - master_record.root_buffer_sizes, - get_thread_id()) - - # Add the new size to the current root buffer size. + # Update the root buffer tip. unsafe_store!( - master_record.root_buffer_sizes, - current_size + size, + master_record.root_buffer_fingers, + gc_frame + size * sizeof(ObjectRef), get_thread_id()) end end """ - pop_gc_frame(size::UInt32) + pop_gc_frame(gc_frame::GCFrame) Deregisters a GC frame. """ -function pop_gc_frame(size::UInt32) +function pop_gc_frame(gc_frame::GCFrame) @nocollect begin master_record = get_gc_master_record() - # Get the current size of the root buffer. 
- current_size = unsafe_load( - master_record.root_buffer_sizes, - get_thread_id()) - - # Subtract the size from the current root buffer size. + # Update the root buffer tip. unsafe_store!( - master_record.root_buffer_sizes, - current_size - size, + master_record.root_buffer_fingers, + gc_frame, get_thread_id()) end end @@ -387,10 +367,15 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap gc_memory_end_ptr = pointer(buffer, length(buffer)) # Set up root buffers. - sizebuf_bytesize = sizeof(Int32) * thread_count - sizebuf_ptr = gc_memory_start_ptr - rootbuf_bytesize = sizeof(ObjectRef) * default_root_buffer_capacity * thread_count - rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, sizebuf_ptr + sizebuf_bytesize) + fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count + fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, gc_memory_start_ptr) + rootbuf_bytesize = sizeof(ObjectRef) * root_buffer_capacity * thread_count + rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, fingerbuf_ptr + fingerbuf_bytesize) + + # Populate the root buffer fingers. + for i in 1:thread_count + unsafe_store!(fingerbuf_ptr, rootbuf_ptr + i * sizeof(ObjectRef) * root_buffer_capacity, i) + end # Compute a pointer to the start of the heap. heap_start_ptr = rootbuf_ptr + rootbuf_bytesize @@ -408,7 +393,7 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap global_arena, GCArenaRecord(0, first_entry_ptr, C_NULL)) - return GCMasterRecord(global_arena, root_buffer_capacity, sizebuf_ptr, rootbuf_ptr) + return GCMasterRecord(global_arena, root_buffer_capacity, fingerbuf_ptr, rootbuf_ptr) end # Collects garbage. 
This function is designed to be called by From f560be645f6a1ec5bb681c2c3a2c3f3443289ca8 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 6 Mar 2019 20:59:41 +0100 Subject: [PATCH 035/146] Lower GC frame management intrinsics to GPU GC calls --- src/CUDAnative.jl | 11 ++++--- src/compiler/driver.jl | 6 ++-- src/compiler/optim.jl | 71 ++++++++++++++++++------------------------ src/compiler/rtlib.jl | 6 +++- src/device/runtime.jl | 69 ++++++++++++++++++++++++++++++++++++++++ src/gc.jl | 50 +++-------------------------- src/interrupts.jl | 2 +- 7 files changed, 119 insertions(+), 96 deletions(-) diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 38d6dd3c..85ea5ef9 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -33,15 +33,18 @@ include(joinpath("device", "llvm.jl")) include(joinpath("device", "runtime.jl")) include(joinpath("device", "libdevice.jl")) include(joinpath("device", "cuda_intrinsics.jl")) -include(joinpath("device", "runtime_intrinsics.jl")) include(joinpath("device", "threading.jl")) -include("compiler.jl") -include("execution.jl") +# The interrupts and GC files need to be loaded _before_ the +# runtime intrinsics file, because some runtime intrinsics +# depend on the GC and the GC depends on interrupts. 
include("interrupts.jl") include("gc.jl") -include("reflection.jl") +include(joinpath("device", "runtime_intrinsics.jl")) +include("compiler.jl") +include("execution.jl") +include("reflection.jl") include("deprecated.jl") include("init.jl") diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index d18bb32f..70ea33ba 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -39,7 +39,7 @@ function compile(target::Symbol, job::CompilerJob; end function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, - optimize::Bool=true, strip::Bool=false) + optimize::Bool=true, strip::Bool=false, internalize::Bool=true) ## Julia IR @timeit to[] "Julia front-end" begin @@ -86,7 +86,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, end if optimize - kernel = @timeit to[] "optimization" optimize!(job, ir, kernel) + kernel = @timeit to[] "optimization" optimize!(job, ir, kernel; internalize=internalize) end if libraries @@ -138,7 +138,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, for dyn_job in keys(worklist) # cached compilation dyn_kernel_fn = get!(kernels, dyn_job) do - dyn_ir, dyn_kernel = codegen(:llvm, dyn_job; optimize=optimize, strip=strip) + dyn_ir, dyn_kernel = codegen(:llvm, dyn_job; optimize=optimize, strip=strip, internalize=internalize) dyn_kernel_fn = LLVM.name(dyn_kernel) dyn_kernel_ft = eltype(llvmtype(dyn_kernel)) link!(ir, dyn_ir) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index ef42d155..4067f518 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1,6 +1,6 @@ # LLVM IR optimization -function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) +function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; internalize::Bool=true) tm = machine(job.cap, triple(mod)) if job.kernel @@ -10,7 +10,9 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) function initialize!(pm) 
add_library_info!(pm, triple(mod)) add_transform_info!(pm, tm) - internalize!(pm, [LLVM.name(entry)]) + if internalize + internalize!(pm, [LLVM.name(entry)]) + end end global current_job @@ -327,7 +329,6 @@ function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool return changed end - # lower object allocations to to PTX malloc # # this is a PoC implementation that is very simple: allocate, and never free. it also runs @@ -383,7 +384,24 @@ function eager_lower_gc_frame!(fun::LLVM.Function) @compiler_assert isempty(uses(barrier)) job end +end +# Visits all calls to a particular intrinsic in a given LLVM module +# and redirects those calls to a different function. +# Returns a Boolean that tells if any calls were actually redirected. +function redirect_calls_to!(from::AbstractString, to, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(from, mod) do call, _ + args = collect(operands(call))[1:end-1] + let builder = Builder(JuliaContext()) + position!(builder, call) + new_call = call!(builder, to, args) + replace_uses!(call, new_call) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed = true + end return changed end @@ -441,8 +459,6 @@ function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) ops = collect(operands(call)) size = ops[1] - # Call the allocation function and bump the resulting pointer - # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) ptr = array_alloca!(builder, T_alloca, size) @@ -463,8 +479,6 @@ function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) frame = ops[1] offset = ops[2] - # Call the allocation function and bump the resulting pointer - # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) ptr = gep!(builder, frame, [offset]) @@ -535,31 +549,8 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) end # Next up: 'julia.new_gc_frame'. 
This intrinsic allocates a new GC frame. - # We'll lower it as an alloca and hope SSA construction and DCE passes - # get rid of the alloca. This is a reasonable thing to hope for because - # all intrinsics that may cause the GC frame to escape will be replaced by - # nops. - visit_calls_to("julia.new_gc_frame", mod) do call, new_gc_frame - new_gc_frame_ft = eltype(llvmtype(new_gc_frame))::LLVM.FunctionType - T_ret = return_type(new_gc_frame_ft)::LLVM.PointerType - T_alloca = eltype(T_ret) - - # Decode the call. - ops = collect(operands(call)) - size = ops[1] - - # Call the allocation function and bump the resulting pointer - # so the headroom sits just in front of the returned pointer. - let builder = Builder(JuliaContext()) - position!(builder, call) - ptr = array_alloca!(builder, T_alloca, size) - replace_uses!(call, ptr) - unsafe_delete!(LLVM.parent(call), call) - dispose(builder) - end - - changed = true - end + # We actually have a call that implements this intrinsic. Let's use that. + changed |= redirect_calls_to!("julia.new_gc_frame", Runtime.get(:new_gc_frame), mod) # The 'julia.get_gc_frame_slot' is closely related to the previous # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of @@ -570,8 +561,6 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) frame = ops[1] offset = ops[2] - # Call the allocation function and bump the resulting pointer - # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) ptr = gep!(builder, frame, [offset]) @@ -583,15 +572,15 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) changed = true end - # The 'julia.push_gc_frame' registers a GC frame with the GC. We - # don't have a GC, so we can just delete calls to this intrinsic! - changed |= delete_calls_to!("julia.push_gc_frame", mod) + # The 'julia.push_gc_frame' registers a GC frame with the GC. We will + # call a function that does just this. 
+ changed |= redirect_calls_to!("julia.push_gc_frame", Runtime.get(:push_gc_frame), mod) - # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so - # we can just delete calls to this intrinsic, too. - changed |= delete_calls_to!("julia.pop_gc_frame", mod) + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC. We again + # have a function in the runtime library. + changed |= redirect_calls_to!("julia.pop_gc_frame", Runtime.get(:pop_gc_frame), mod) - # Ditto for 'julia.queue_gc_root'. + # Delete calls to 'julia.queue_gc_root'. changed |= delete_calls_to!("julia.queue_gc_root", mod) return changed diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 385ac218..ad82f984 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -124,8 +124,12 @@ end function emit_function!(mod, cap, f, types, name) tt = Base.to_tuple_type(types) + # Optimize the module that defines the function, but don't + # internalize symbols in that function yet: internalizing + # globals may de-alias references to globals in the runtime + # library from equivalent references in the kernel. 
new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false); - libraries=false) + libraries=false, internalize=false) LLVM.name!(entry, name) link!(mod, new_mod) end diff --git a/src/device/runtime.jl b/src/device/runtime.jl index a331c9ee..a8ff03a6 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -13,6 +13,7 @@ using ..CUDAnative using LLVM using LLVM.Interop +import ..CUDAnative: @nocollect, ObjectRef, GCFrame, get_gc_master_record, get_thread_id, new_gc_frame_impl ## representation of a runtime method instance @@ -225,6 +226,13 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end +# LLVM type of a pointer to a tracked pointer +function T_pprjlvalue() + T_pjlvalue = convert(LLVMType, Any, true) + LLVM.PointerType( + LLVM.PointerType(eltype(T_pjlvalue), Tracked)) +end + """ gc_malloc_object(bytesize::Csize_t) @@ -237,4 +245,65 @@ end compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) +""" + new_gc_frame(size::UInt32)::GCFrame + +Allocates a new GC frame. +""" +function new_gc_frame(size::UInt32)::GCFrame + @nocollect new_gc_frame_impl(size) +end + +compile(new_gc_frame, Any, (Cuint,), T_pprjlvalue) + +""" + push_gc_frame(gc_frame::GCFrame, size::UInt32) + +Registers a GC frame with the garbage collector. +""" +function push_gc_frame(gc_frame::GCFrame, size::UInt32) + @nocollect begin + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame + size * sizeof(ObjectRef), + get_thread_id()) + return + end +end + +compile( + push_gc_frame, + Nothing, + (GCFrame, Cuint), + () -> convert(LLVMType, Cvoid), + () -> [T_pprjlvalue(), convert(LLVMType, UInt32)]) + +""" + pop_gc_frame(gc_frame::GCFrame) + +Deregisters a GC frame. +""" +function pop_gc_frame(gc_frame::GCFrame) + @nocollect begin + master_record = get_gc_master_record() + + # Update the root buffer tip. 
+ unsafe_store!( + master_record.root_buffer_fingers, + gc_frame, + get_thread_id()) + return + end +end + +compile( + pop_gc_frame, + Nothing, + (GCFrame,), + () -> convert(LLVMType, Cvoid), + () -> [T_pprjlvalue()]) + end diff --git a/src/gc.jl b/src/gc.jl index 660614c3..36937b32 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -69,6 +69,9 @@ end # A reference to a Julia object. const ObjectRef = Ptr{Nothing} +# A GC frame is just a pointer to an array of Julia objects. +const GCFrame = Ptr{ObjectRef} + # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed # once the host has set it up. @@ -118,58 +121,13 @@ end return threadIdx().x end -const GCFrame = Ptr{ObjectRef} - -# Same as 'new_gc_frame_impl', but does not disable collections. +# Same as 'new_gc_frame', but does not disable collections. function new_gc_frame_impl(size::UInt32)::GCFrame master_record = get_gc_master_record() # Return the root buffer tip: that's where the new GC frame starts. return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) end -""" - new_gc_frame(size::UInt32)::Ptr{ObjectRef} - -Allocates a new GC frame. -""" -function new_gc_frame(size::UInt32)::GCFrame - @nocollect new_gc_frame_impl(size) -end - -""" - push_gc_frame(gc_frame::GCFrame, size::UInt32) - -Registers a GC frame with the garbage collector. -""" -function push_gc_frame(gc_frame::GCFrame, size::UInt32) - @nocollect begin - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame + size * sizeof(ObjectRef), - get_thread_id()) - end -end - -""" - pop_gc_frame(gc_frame::GCFrame) - -Deregisters a GC frame. -""" -function pop_gc_frame(gc_frame::GCFrame) - @nocollect begin - master_record = get_gc_master_record() - - # Update the root buffer tip. 
- unsafe_store!( - master_record.root_buffer_fingers, - gc_frame, - get_thread_id()) - end -end - const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. diff --git a/src/interrupts.jl b/src/interrupts.jl index d70c2773..be60697e 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -71,7 +71,7 @@ end # If the global hasn't been defined already, then we'll define # it in the global address space, i.e., address space one. global_var = GlobalVariable(mod, T_global, global_name_string, 1) - linkage!(global_var, LLVM.API.LLVMLinkOnceODRLinkage) + linkage!(global_var, LLVM.API.LLVMLinkOnceAnyLinkage) initializer!(global_var, LLVM.null(T_global)) end From 53db509d379308256b1d0d2ee1e24e441b2fd5d8 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 11:29:41 +0100 Subject: [PATCH 036/146] Allow GC frame management functions to execute concurrently with the GC --- src/compiler/optim.jl | 2 +- src/device/runtime.jl | 38 +++++++++++++++++--------------------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 4067f518..f798f96e 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -392,7 +392,7 @@ end function redirect_calls_to!(from::AbstractString, to, mod::LLVM.Module)::Bool changed = false visit_calls_to(from, mod) do call, _ - args = collect(operands(call))[1:end-1] + args = collect(operands(call))[1:end - 1] let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, to, args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index a8ff03a6..91dab063 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -251,7 +251,7 @@ compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) Allocates a new GC frame. 
""" function new_gc_frame(size::UInt32)::GCFrame - @nocollect new_gc_frame_impl(size) + new_gc_frame_impl(size) end compile(new_gc_frame, Any, (Cuint,), T_pprjlvalue) @@ -262,16 +262,14 @@ compile(new_gc_frame, Any, (Cuint,), T_pprjlvalue) Registers a GC frame with the garbage collector. """ function push_gc_frame(gc_frame::GCFrame, size::UInt32) - @nocollect begin - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame + size * sizeof(ObjectRef), - get_thread_id()) - return - end + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame + size * sizeof(ObjectRef), + get_thread_id()) + return end compile( @@ -287,16 +285,14 @@ compile( Deregisters a GC frame. """ function pop_gc_frame(gc_frame::GCFrame) - @nocollect begin - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame, - get_thread_id()) - return - end + master_record = get_gc_master_record() + + # Update the root buffer tip. 
+ unsafe_store!( + master_record.root_buffer_fingers, + gc_frame, + get_thread_id()) + return end compile( From 358ceaea49133265796c28fb6a644234fd480aa3 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 11:46:42 +0100 Subject: [PATCH 037/146] Move GC frame management functions into 'gc.jl' --- src/device/runtime.jl | 51 +++++-------------------------------------- src/gc.jl | 42 ++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 91dab063..e456c356 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -13,8 +13,7 @@ using ..CUDAnative using LLVM using LLVM.Interop -import ..CUDAnative: @nocollect, ObjectRef, GCFrame, get_gc_master_record, get_thread_id, new_gc_frame_impl - +import ..CUDAnative: GCFrame ## representation of a runtime method instance struct RuntimeMethodInstance @@ -245,58 +244,18 @@ end compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) -""" - new_gc_frame(size::UInt32)::GCFrame - -Allocates a new GC frame. -""" -function new_gc_frame(size::UInt32)::GCFrame - new_gc_frame_impl(size) -end - -compile(new_gc_frame, Any, (Cuint,), T_pprjlvalue) - -""" - push_gc_frame(gc_frame::GCFrame, size::UInt32) - -Registers a GC frame with the garbage collector. -""" -function push_gc_frame(gc_frame::GCFrame, size::UInt32) - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame + size * sizeof(ObjectRef), - get_thread_id()) - return -end +# Include GC frame management functions into the runtime. +compile(CUDAnative.new_gc_frame, Any, (Cuint,), T_pprjlvalue) compile( - push_gc_frame, + CUDAnative.push_gc_frame, Nothing, (GCFrame, Cuint), () -> convert(LLVMType, Cvoid), () -> [T_pprjlvalue(), convert(LLVMType, UInt32)]) -""" - pop_gc_frame(gc_frame::GCFrame) - -Deregisters a GC frame. 
-""" -function pop_gc_frame(gc_frame::GCFrame) - master_record = get_gc_master_record() - - # Update the root buffer tip. - unsafe_store!( - master_record.root_buffer_fingers, - gc_frame, - get_thread_id()) - return -end - compile( - pop_gc_frame, + CUDAnative.pop_gc_frame, Nothing, (GCFrame,), () -> convert(LLVMType, Cvoid), diff --git a/src/gc.jl b/src/gc.jl index 36937b32..61b12f86 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -121,13 +121,49 @@ end return threadIdx().x end -# Same as 'new_gc_frame', but does not disable collections. -function new_gc_frame_impl(size::UInt32)::GCFrame +""" + new_gc_frame(size::UInt32)::GCFrame + +Allocates a new GC frame. +""" +function new_gc_frame(size::UInt32)::GCFrame master_record = get_gc_master_record() # Return the root buffer tip: that's where the new GC frame starts. return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) end +""" + push_gc_frame(gc_frame::GCFrame, size::UInt32) + +Registers a GC frame with the garbage collector. +""" +function push_gc_frame(gc_frame::GCFrame, size::UInt32) + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame + size * sizeof(ObjectRef), + get_thread_id()) + return +end + +""" + pop_gc_frame(gc_frame::GCFrame) + +Deregisters a GC frame. +""" +function pop_gc_frame(gc_frame::GCFrame) + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame, + get_thread_id()) + return +end + const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. @@ -261,7 +297,7 @@ function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt # won't get collected by the GC before the caller has a chance to add it to its # own GC frame. 
if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) - gc_frame = new_gc_frame_impl(UInt32(1)) + gc_frame = new_gc_frame(UInt32(1)) unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) end return result_ptr From ecc601d709feb878c72df688fc09324d43b0ebad Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 11:47:43 +0100 Subject: [PATCH 038/146] Mark GC frame management functions as '@inline' --- src/gc.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 61b12f86..7d7ed8e1 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -126,7 +126,7 @@ end Allocates a new GC frame. """ -function new_gc_frame(size::UInt32)::GCFrame +@inline function new_gc_frame(size::UInt32)::GCFrame master_record = get_gc_master_record() # Return the root buffer tip: that's where the new GC frame starts. return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) @@ -137,7 +137,7 @@ end Registers a GC frame with the garbage collector. """ -function push_gc_frame(gc_frame::GCFrame, size::UInt32) +@inline function push_gc_frame(gc_frame::GCFrame, size::UInt32) master_record = get_gc_master_record() # Update the root buffer tip. @@ -153,7 +153,7 @@ end Deregisters a GC frame. """ -function pop_gc_frame(gc_frame::GCFrame) +@inline function pop_gc_frame(gc_frame::GCFrame) master_record = get_gc_master_record() # Update the root buffer tip. From dcec58d56f9037be94d46c73bd9cc3251ea09ede Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 12:11:37 +0100 Subject: [PATCH 039/146] Update 'get_thread_id' to take blocks into account --- src/gc.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc.jl b/src/gc.jl index 7d7ed8e1..2aab537f 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -118,7 +118,7 @@ end # Gets the thread ID of the current thread. 
@inline function get_thread_id() - return threadIdx().x + return (blockIdx().x - 1) * blockDim().x + threadIdx().x end """ From f198cf86ec768fa1100a597ccf38f189d7a8daaf Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 13:01:08 +0100 Subject: [PATCH 040/146] Introduce GC heap management data structures --- src/gc.jl | 99 +++++++++++++++++++++++++++++++++++++++++------ src/interrupts.jl | 32 +++++++++------ 2 files changed, 109 insertions(+), 22 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 2aab537f..edd94cd0 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -355,10 +355,36 @@ const initial_gc_heap_size = 16 * (1 << 20) # 256 roots. That's 2 KiB of roots per thread. const default_root_buffer_capacity = 256 -# Initializes GC memory and produces a master record. -function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord - gc_memory_start_ptr = pointer(buffer, 1) - gc_memory_end_ptr = pointer(buffer, length(buffer)) +# A description of a region of memory that has been allocated to the GC heap. +struct GCHeapRegion + # A buffer that contains the GC region's bytes. + buffer::Array{UInt8, 1} + # A pointer to the first element in the region. + start::Ptr{UInt8} + # The region's size in bytes. + size::Csize_t +end + +GCHeapRegion(buffer::Array{UInt8, 1}) = GCHeapRegion(buffer, pointer(buffer, 1), Csize_t(length(buffer))) + +# A description of all memory that has been allocated to the GC heap. +struct GCHeapDescription + # A list of the set of regions that comprise the GC heap. + regions::Array{GCHeapRegion, 1} +end + +GCHeapDescription() = GCHeapDescription([]) + +# Initializes a GC heap and produces a master record. 
+function gc_init!( + heap::GCHeapDescription, + thread_count::Integer; + root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord + + master_region = heap.regions[1] + + gc_memory_start_ptr = master_region.start + gc_memory_end_ptr = master_region.start + master_region.size # Set up root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count @@ -390,9 +416,58 @@ function gc_init(buffer::Array{UInt8, 1}, thread_count::Integer; root_buffer_cap return GCMasterRecord(global_arena, root_buffer_capacity, fingerbuf_ptr, rootbuf_ptr) end +# Tells if a GC heap contains a particular pointer. +function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T + for region in heap.regions + if pointer >= region.start && pointer < region.start + region.size + return true + end + end + return false +end + +# Expands the GC heap by allocating a region of memory and adding it to +# the list of allocated regions. `size` describes the amount of bytes to +# allocate. Returns the allocated region. +function expand!(heap::GCHeapDescription, size::Integer)::GCHeapRegion + buffer = alloc_shared_array((size,), UInt8(0)) + region = GCHeapRegion(buffer) + push!(heap.regions, region) + return region +end + +# Frees all memory allocated by a GC heap. +function free!(heap::GCHeapDescription) + for region in heap.regions + free_shared_array(region.buffer) + end +end + # Collects garbage. This function is designed to be called by # the host, not by the device. -function gc_collect_impl(master_record::GCMasterRecord) +function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) + + # The Julia CPU GC is precise and the information it uses for precise + # garbage collection is stored in memory that we should be able to access. + # However, the way the CPU GC stores field information is incredibly + # complicated and replicating that logic here would be a royal pain to + # implement and maintain. 
Ideally, the CPU GC would expose an interface that + # allows us to point to an object and ask the GC for all GC-tracked pointers + # it contains. Alas, no such luck: the CPU GC doesn't even have an internal + # function that does that. The CPU GC's logic for finding GC-tracked pointer + # fields is instead fused tightly with its 'mark' loop. + # + # To cope with this, we will simply implement a conservative GC: we precisely + # scan the roots for pointers into the GC heap. We then recursively mark blocks + # that are pointed to by such pointers as live and conservatively scan them for + # more pointers. + # + # A conservative GC is fairly simple: we maintain a worklist of pointers that + # are live and may need to be processed, as well as a set of pointers that are + # live and have already been processed. + live_pointers = Set{ObjectRef}() + live_worklist = [] + println("GC collections are not implemented yet.") end @@ -453,12 +528,14 @@ macro cuda_gc(ex...) quote GC.@preserve $(vars...) begin # Define a trivial buffer that contains the interrupt state. - local host_interrupt_array, device_interrupt_buffer = alloc_shared_array((1,), ready) + local host_interrupt_array = alloc_shared_array((1,), ready) + local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) # Allocate a shared buffer for GC memory. local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_capacity * $(esc(thread_count)) - local host_gc_array, device_gc_buffer = alloc_shared_array((gc_memory_size,), UInt8(0)) - local master_record = gc_init(host_gc_array, $(esc(thread_count))) + local gc_heap = GCHeapDescription() + expand!(gc_heap, gc_memory_size) + local master_record = gc_init!(gc_heap, $(esc(thread_count))) # Define a kernel initialization function. local function kernel_init(kernel) @@ -488,7 +565,7 @@ macro cuda_gc(ex...) 
end local function handle_interrupt() - gc_collect_impl(master_record) + gc_collect_impl(master_record, gc_heap) end try @@ -502,8 +579,8 @@ macro cuda_gc(ex...) # Handle interrupts. handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) finally - free_shared_array(device_interrupt_buffer) - free_shared_array(device_gc_buffer) + free_shared_array(host_interrupt_array) + free!(gc_heap) end end end) diff --git a/src/interrupts.jl b/src/interrupts.jl index be60697e..7793b42d 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -11,9 +11,9 @@ export @cuda_interruptible, interrupt, interrupt_or_wait, wait_for_interrupt # Allocates an array of host memory that is page-locked and accessible # to the device. Maps the allocation into the CUDA address space. -# Returns a (host array, device buffer) pair. The former can be used by -# the host to access the array, the latter can be used by the device. -function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} +# Returns a host array that can be turned into a device array by calling +# the `get_shared_device_buffer` function. +function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T)::Array{T, N} where {T, N} # Allocate memory that is accessible to both the host and the device. bytesize = prod(dims) * sizeof(T) ptr_ref = Ref{Ptr{Cvoid}}() @@ -22,20 +22,29 @@ function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T) where {T, N} (Ptr{Ptr{Cvoid}}, Csize_t), ptr_ref, bytesize) - device_buffer = CUDAdrv.Mem.Buffer(convert(CuPtr{T}, convert(Csize_t, ptr_ref[])), bytesize, CuCurrentContext()) - - # Wrap the memory in an array for the host. + # Wrap the memory in an array. host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) # Initialize the array's contents. fill!(host_array, init) - return host_array, device_buffer + return host_array +end + +# Gets the device array that corresponds to a shared host array. 
+# NOTE: this function only works for arrays that were allocated by +# `alloc_shared_array`. It has undefined behavior for all other arrays. +function get_shared_device_buffer(shared_array::Array{T, N})::Mem.Buffer where {T, N} + bytesize = length(shared_array) * sizeof(T) + CUDAdrv.Mem.Buffer( + convert(CuPtr{T}, convert(Csize_t, pointer(shared_array, 1))), + bytesize, + CuCurrentContext()) end # Frees an array of host memory. -function free_shared_array(buffer::Mem.Buffer) - ptr = convert(Ptr{Cvoid}, convert(Csize_t, buffer.ptr)) +function free_shared_array(shared_array::Array{T, N}) where {T, N} + ptr = pointer(shared_array, 1) @apicall( :cuMemFreeHost, (Ptr{Cvoid},), @@ -233,7 +242,8 @@ macro cuda_interruptible(handler, ex...) quote GC.@preserve $(vars...) begin # Define a trivial buffer that contains the interrupt state. - local host_array, device_buffer = alloc_shared_array((1,), ready) + local host_array = alloc_shared_array((1,), ready) + local device_buffer = get_shared_device_buffer(host_array) try # Define a kernel initialization function that sets the @@ -261,7 +271,7 @@ macro cuda_interruptible(handler, ex...) # Handle interrupts. handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) finally - free_shared_array(device_buffer) + free_shared_array(host_array) end end end) From d039839360027d71c7ef5302be0d5498bc25bcb5 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 16:46:30 +0100 Subject: [PATCH 041/146] Implement the mark & sweep phases of the GC --- src/gc.jl | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 189 insertions(+), 14 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index edd94cd0..486a705a 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -26,6 +26,8 @@ export @cuda_gc, gc_malloc, gc_collect +import Base: length + # A data structure that precedes every chunk of memory that has been # allocated or put into the free list. 
struct GCAllocationRecord @@ -40,6 +42,11 @@ struct GCAllocationRecord next::Ptr{GCAllocationRecord} end +# Gets a pointer to the first byte of data managed by an allocation record. +function data_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} + Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(GCAllocationRecord) +end + @generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name} index = Base.fieldindex(TBase, field_name) offset = Base.fieldoffset(TBase, index) @@ -83,6 +90,9 @@ struct GCMasterRecord # of roots per thread. root_buffer_capacity::UInt32 + # The number of threads. + thread_count::UInt32 + # A pointer to a list of root buffer pointers that point to the # end of the root buffer for every thread. root_buffer_fingers::Ptr{Ptr{ObjectRef}} @@ -92,6 +102,11 @@ struct GCMasterRecord root_buffers::Ptr{ObjectRef} end +# Iterates through all arena pointers stored in a GC master record. +@inline function iterate_arenas(fun::Function, master_record::GCMasterRecord) + fun(master_record.global_arena) +end + # Gets the global GC interrupt lock. @inline function get_interrupt_lock()::ReaderWriterLock return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState)) @@ -195,7 +210,7 @@ function gc_use_free_list_entry( # to create a new entry from any unused memory in the entry. # Compute the address to return. - data_address = Base.unsafe_convert(Ptr{UInt8}, entry) + sizeof(GCAllocationRecord) + data_address = data_pointer(entry) # Compute the end of the free memory chunk. end_address = data_address + entry_data.size @@ -207,11 +222,12 @@ function gc_use_free_list_entry( new_entry_address = new_data_address - sizeof(GCAllocationRecord) if new_entry_address < data_address + bytesize new_entry_address += gc_align + new_data_address += gc_align end # If we can place a new entry just past the allocation, then we should # by all means do so. 
- if new_entry_address + sizeof(GCAllocationRecord) < end_address + if new_data_address < end_address # Create a new free list entry. new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) new_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, new_entry_address) @@ -329,11 +345,39 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} end # Alright, so that was a spectacular failure. Let's just throw an exception. - @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", bytesize) + @cuprintf("ERROR: Out of GPU GC memory (trying to allocate %i bytes)\n", bytesize) # throw(OutOfMemoryError()) return C_NULL end +# Tries to free a block of memory from a particular arena. `record_ptr` +# must point to a pointer to the GC allocation record to free. It will +# be updated to point to the next allocation. +# +# This function is designed to be called by the host: it does not +# turn off collections. It can be called by the device, but in that +# case it should be prefixed by the `@nocollect` macro followed by +# a write lock acquisition on the arena's lock. +function gc_free_local_impl( + arena::Ptr{GCArenaRecord}, + record_ptr::Ptr{Ptr{GCAllocationRecord}}) + + record = unsafe_load(record_ptr) + next_record_ptr = @get_field_pointer(record, :next) + free_list_head_ptr = @get_field_pointer(arena, :free_list_head) + + # Remove the record from the allocation list. + next_record = unsafe_load(next_record_ptr) + unsafe_store!(record_ptr, next_record) + + println("Freeing $(unsafe_load(record).size) bytes at $(data_pointer(record))") + + # Add the record to the free list and update its `next` pointer + # (but not in that order). + unsafe_store!(next_record_ptr, unsafe_load(free_list_head_ptr)) + unsafe_store!(free_list_head_ptr, record) +end + """ gc_collect() @@ -394,7 +438,7 @@ function gc_init!( # Populate the root buffer fingers. 
     for i in 1:thread_count
-        unsafe_store!(fingerbuf_ptr, rootbuf_ptr + i * sizeof(ObjectRef) * root_buffer_capacity, i)
+        unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * root_buffer_capacity, i)
     end
 
     # Compute a pointer to the start of the heap.
@@ -413,7 +457,7 @@ function gc_init!(
         global_arena,
         GCArenaRecord(0, first_entry_ptr, C_NULL))
 
-    return GCMasterRecord(global_arena, root_buffer_capacity, fingerbuf_ptr, rootbuf_ptr)
+    return GCMasterRecord(global_arena, root_buffer_capacity, UInt32(thread_count), fingerbuf_ptr, rootbuf_ptr)
 end
 
 # Tells if a GC heap contains a particular pointer.
@@ -443,10 +487,89 @@ function free!(heap::GCHeapDescription)
     end
 end
 
-# Collects garbage. This function is designed to be called by
-# the host, not by the device.
-function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription)
+# A sorted list of all allocation records for allocated blocks.
+# This data structure is primarily useful for rapidly mapping
+# pointers to the allocated blocks that contain them.
+struct SortedAllocationList
+    # An array of pointers to allocation records. The pointers
+    # are all sorted.
+    records::Array{Ptr{GCAllocationRecord}, 1}
+end
+
+length(alloc_list::SortedAllocationList) = length(alloc_list.records)
+
+# Gets a pointer to the allocation record that manages the memory
+# pointed to by `pointer`. Returns a null pointer if there is no
+# such record.
+function get_record(
+    alloc_list::SortedAllocationList,
+    pointer::Ptr{T})::Ptr{GCAllocationRecord} where T
+    cast_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, pointer)
+
+    # Deal with the most common cases quickly.
+ if length(alloc_list) == 0 || + pointer < data_pointer(alloc_list.records[1]) || + pointer > data_pointer(alloc_list.records[end]) + Base.unsafe_load(alloc_list.records[end]).size + + return C_NULL + end + + # To do this lookup quickly, we will do a binary search for the + # biggest allocation record pointer that is smaller than `pointer`. + range_start, range_end = 1, length(alloc_list) + while range_end - range_start > 1 + range_mid = div(range_start + range_end, 2) + mid_val = alloc_list.records[range_mid] + if mid_val > cast_ptr + range_end = range_mid + else + range_start = range_mid + end + end + + record = alloc_list.records[range_end] + if record >= cast_ptr + record = alloc_list.records[range_start] + end + + # Make sure that the pointer actually points to a region of memory + # that is managed by the candidate record we found. + record_data_pointer = data_pointer(record) + if cast_ptr >= record_data_pointer && cast_ptr < record_data_pointer + unsafe_load(record).size + return record + else + return C_NULL + end +end + +# Iterates through a linked list of allocation records and apply a function +# to every node in the linked list. The function is allowed to modify allocation +# records. +@inline function iterate_allocation_records(fun::Function, head::Ptr{GCAllocationRecord}) + while head != C_NULL + fun(head) + head = unsafe_load(head).next + end +end + +# Takes a GC master record and constructs a sorted allocation list +# based on it. +function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationList + records = [] + iterate_arenas(master_record) do arena + allocation_list_head = unsafe_load(arena).allocation_list_head + iterate_allocation_records(allocation_list_head) do record + push!(records, record) + end + end + sort!(records) + return SortedAllocationList(records) +end + +# Collects garbage. This function is designed to be called by the host, +# not by the device. 
+function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # The Julia CPU GC is precise and the information it uses for precise # garbage collection is stored in memory that we should be able to access. # However, the way the CPU GC stores field information is incredibly @@ -457,18 +580,70 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # function that does that. The CPU GC's logic for finding GC-tracked pointer # fields is instead fused tightly with its 'mark' loop. # - # To cope with this, we will simply implement a conservative GC: we precisely + # To cope with this, we will simply implement a semi-conservative GC: we precisely # scan the roots for pointers into the GC heap. We then recursively mark blocks # that are pointed to by such pointers as live and conservatively scan them for # more pointers. # - # A conservative GC is fairly simple: we maintain a worklist of pointers that - # are live and may need to be processed, as well as a set of pointers that are + # Our mark phase is fairly simple: we maintain a worklist of pointers that + # are live and may need to be processed, as well as a set of blocks that are # live and have already been processed. - live_pointers = Set{ObjectRef}() - live_worklist = [] + live_blocks = Set{Ptr{GCAllocationRecord}}() + live_worklist = Ptr{ObjectRef}[] + + # Get a sorted allocation list, which will allow us to classify live pointers quickly. + alloc_list = sort_allocation_list(master_record) - println("GC collections are not implemented yet.") + # Add all roots to the worklist. + for i in 1:(master_record.root_buffer_capacity * master_record.thread_count) + root = unsafe_load(master_record.root_buffers, i) + if root != C_NULL + push!(live_worklist, root) + end + end + + # Now process all live pointers until we reach a fixpoint. + while !isempty(live_worklist) + # Pop a pointer from the worklist. + object_ref = pop!(live_worklist) + # Get the block for that pointer. 
+ record = get_record(alloc_list, object_ref) + # Make sure that we haven't visited the block yet. + if record != C_NULL && !(record in live_blocks) + # Mark the block as live. + push!(live_blocks, record) + # Add all pointer-sized, aligned values to the live pointer worklist. + block_pointer = data_pointer(record) + block_size = unsafe_load(record).size + for i in 0:sizeof(ObjectRef):(block_size - 1) + push!(live_worklist, Base.unsafe_convert(ObjectRef, block_pointer + i)) + end + end + end + + # We're done with the mark phase! Time to proceed to the sweep phase. + # The first thing we'll do is iterate through every arena's allocation list and + # free dead blocks. + iterate_arenas(master_record) do arena + record_ptr = @get_field_pointer(arena, :allocation_list_head) + while true + record = unsafe_load(record_ptr) + if record == C_NULL + # We've reached the end of the list. + break + end + + if record in live_blocks + # We found a live block. Proceed to the next block. + record_ptr = @get_field_pointer(record, :next) + else + # We found a dead block. Release it. Don't proceed to the + # next block because the current block will change in the + # next iteration of this loop. + gc_free_local_impl(arena, record_ptr) + end + end + end end # Examines a keyword argument list and gets either the value From 7358f9cbee01e345a4729220f82fe02d67f22b85 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 7 Mar 2019 19:00:19 +0100 Subject: [PATCH 042/146] Implement a free list compaction and extra memory allocation scheme --- src/gc.jl | 127 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 114 insertions(+), 13 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 486a705a..62e484ba 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -42,11 +42,6 @@ struct GCAllocationRecord next::Ptr{GCAllocationRecord} end -# Gets a pointer to the first byte of data managed by an allocation record. 
-function data_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} - Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(GCAllocationRecord) -end - @generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name} index = Base.fieldindex(TBase, field_name) offset = Base.fieldoffset(TBase, index) @@ -59,6 +54,16 @@ macro get_field_pointer(base_pointer, field_name) :(get_field_pointer_impl($(esc(base_pointer)), Val($field_name))) end +# Gets a pointer to the first byte of data managed by an allocation record. +function data_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} + Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(GCAllocationRecord) +end + +# Gets a pointer to the first byte of data no longer managed by an allocation record. +function data_end_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} + data_pointer(record) + unsafe_load(@get_field_pointer(record, :size)) +end + # A data structure that describes a single GC "arena", i.e., # a section of the heap that is managed by the GC. Every arena # has its own free list and allocation list. @@ -350,6 +355,16 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} return C_NULL end +# Zero-fills a range of memory. +function zero_fill!(start_ptr::Ptr{UInt8}, size::Csize_t) + ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), start_ptr, 0, size) +end + +# Zero-fills a range of memory. +function zero_fill!(start_ptr::Ptr{UInt8}, end_ptr::Ptr{UInt8}) + zero_fill!(start_ptr, Csize_t(end_ptr) - Csize_t(start_ptr)) +end + # Tries to free a block of memory from a particular arena. `record_ptr` # must point to a pointer to the GC allocation record to free. It will # be updated to point to the next allocation. 
@@ -370,12 +385,13 @@ function gc_free_local_impl( next_record = unsafe_load(next_record_ptr) unsafe_store!(record_ptr, next_record) - println("Freeing $(unsafe_load(record).size) bytes at $(data_pointer(record))") - # Add the record to the free list and update its `next` pointer # (but not in that order). unsafe_store!(next_record_ptr, unsafe_load(free_list_head_ptr)) unsafe_store!(free_list_head_ptr, record) + + # Zero-fill the newly freed block of memory. + zero_fill!(data_pointer(record), unsafe_load(@get_field_pointer(record, :size))) end """ @@ -399,6 +415,14 @@ const initial_gc_heap_size = 16 * (1 << 20) # 256 roots. That's 2 KiB of roots per thread. const default_root_buffer_capacity = 256 +# The point at which an arena is deemed to be starving, i.e., +# it no longer contains enough memory to perform basic allocations. +# If an arena's free byte count stays below the arena starvation +# size after a collection phase, the collector will allocate additional +# memory to the arena such that it is no longer starving. +# The arena starvation limit is currently set to 4 MiB. +const arena_starvation_limit = 4 * (1 << 20) + # A description of a region of memory that has been allocated to the GC heap. struct GCHeapRegion # A buffer that contains the GC region's bytes. @@ -443,13 +467,11 @@ function gc_init!( # Compute a pointer to the start of the heap. heap_start_ptr = rootbuf_ptr + rootbuf_bytesize - global_arena_size = Csize_t(gc_memory_end_ptr) - Csize_t(heap_start_ptr) - sizeof(GCAllocationRecord) - sizeof(GCArenaRecord) # Create a single free list entry. - first_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, heap_start_ptr + sizeof(GCArenaRecord)) - unsafe_store!( - first_entry_ptr, - GCAllocationRecord(global_arena_size, C_NULL)) + first_entry_ptr = make_gc_block!( + heap_start_ptr + sizeof(GCArenaRecord), + Csize_t(gc_memory_end_ptr) - Csize_t(heap_start_ptr) - sizeof(GCArenaRecord)) # Set up the main GC data structure. 
     global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, heap_start_ptr)
@@ -460,6 +482,18 @@ function gc_init!(
     return GCMasterRecord(global_arena, root_buffer_capacity, UInt32(thread_count), fingerbuf_ptr, rootbuf_ptr)
 end
 
+# Takes a zero-filled region of memory and turns it into a block
+# managed by the GC, prefixed with an allocation record.
+function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCAllocationRecord} where T
+    entry = Base.unsafe_convert(Ptr{GCAllocationRecord}, start_ptr)
+    unsafe_store!(
+        entry,
+        GCAllocationRecord(
+            Csize_t(start_ptr + size) - Csize_t(data_pointer(entry)),
+            C_NULL))
+    return entry
+end
+
 # Tells if a GC heap contains a particular pointer.
 function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T
     for region in heap.regions
@@ -567,6 +601,56 @@ function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationLi
     return SortedAllocationList(records)
 end
 
+# Compact a GC arena's free list. This function will
+# 1. merge adjacent free blocks, and
+# 2. reorder free blocks to put small blocks at the front
+#    of the free list,
+# 3. tally the total number of free bytes and return that number.
+function gc_compact_free_list(arena::Ptr{GCArenaRecord})::Csize_t
+    # Let's start by creating a list of all free list records.
+    records = Ptr{GCAllocationRecord}[]
+    free_list_head = unsafe_load(arena).free_list_head
+    iterate_allocation_records(free_list_head) do record
+        push!(records, record)
+    end
+
+    # We now sort those records and loop through the sorted list,
+    # merging free list entries as we go along.
+    sort!(records)
+
+    i = 1
+    while i < length(records)
+        first_record = records[i]
+        second_record = records[i + 1]
+        if data_end_pointer(first_record) == Base.unsafe_convert(Ptr{UInt8}, second_record)
+            # We found two adjacent free list entries. Expand the first
+            # record's size to encompass both entries, zero-fill the second
+            # record's header and delete it from the list of records.
+ new_size = Csize_t(data_end_pointer(second_record)) - Csize_t(data_pointer(first_record)) + zero_fill!(data_end_pointer(first_record), data_pointer(second_record)) + unsafe_store!(@get_field_pointer(first_record, :size), new_size) + deleteat!(records, i + 1) + else + i += 1 + end + end + + # Now sort the records based on size. Put the smallest records first to + # discourage fragmentation. + sort!(records; lt = (x, y) -> unsafe_load(x).size < unsafe_load(y).size) + + # Reconstruct the free list as a linked list. + prev_record_ptr = @get_field_pointer(arena, :free_list_head) + for record in records + unsafe_store!(prev_record_ptr, record) + prev_record_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(prev_record_ptr, C_NULL) + + # Compute the total number of free bytes. + return sum(record -> unsafe_load(record).size, records) +end + # Collects garbage. This function is designed to be called by the host, # not by the device. function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) @@ -623,7 +707,8 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # We're done with the mark phase! Time to proceed to the sweep phase. # The first thing we'll do is iterate through every arena's allocation list and - # free dead blocks. + # free dead blocks. Next, we will compact and reorder free lists to combat + # fragmentation. iterate_arenas(master_record) do arena record_ptr = @get_field_pointer(arena, :allocation_list_head) while true @@ -643,6 +728,22 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) gc_free_local_impl(arena, record_ptr) end end + + # Compact the free list. + free_memory = gc_compact_free_list(arena) + + # If the amount of free memory in the arena is below the starvation + # limit then we'll expand the GC heap and add the additional memory + # to the arena's free list. 
+ if free_memory < arena_starvation_limit + region = expand!(heap, arena_starvation_limit) + extra_record = make_gc_block!(region.start, region.size) + last_free_list_ptr = @get_field_pointer(arena, :free_list_head) + iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record + last_free_list_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(last_free_list_ptr, extra_record) + end end end From 457006a4c79a9c546f7c25e95d4533d30256f04c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 11:40:05 +0100 Subject: [PATCH 043/146] Update GC docs --- src/gc.jl | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 62e484ba..73dbfa0c 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1,9 +1,6 @@ # This file contains a GC implementation for CUDAnative kernels. -# -# CURRENT STATE OF THE GC -# -# Simple memory allocation is underway. Memory allocation currently -# uses a simple free-list. +# The sections below contain some basic info on how the garbage +# collector works. # # MEMORY ALLOCATION # @@ -12,12 +9,18 @@ # the allocator also maintains a list of all allocated blocks, so # the collector knows which blocks it can free. # -# END GOAL +# GARBAGE COLLECTION +# +# The garbage collector itself is a semi-conservative, non-moving, +# mark-and-sweep GC that runs on the host. The device may trigger +# the GC via an interrupt. +# +# The GC is semi-conservative in the sense that its set of roots +# is precise but objects are scanned in an imprecise way. # -# The CUDAnative GC is a precise, non-moving, mark-and-sweep GC that runs -# on the host. The device may trigger the GC via an interrupt. +# MISCELLANEOUS # -# Some GPU-related GC implementation details: +# Some miscellaneous GPU-related GC implementation details: # # * GC memory is shared by the host and device. # * Every thread gets a fixed region of memory for storing GC roots in. 
From 0666d093de7cf4f0a2d296ff972dc138afd17720 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 11:50:42 +0100 Subject: [PATCH 044/146] Modify GC lock acquisition scheme slightly --- src/gc.jl | 55 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 73dbfa0c..bfc54ba8 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -306,26 +306,24 @@ end # Returns a null pointer if no sufficiently large chunk of # memory can be found. function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt8} - # Disable collections and acquire the arena's lock. - @nocollect begin - arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) - result_ptr = writer_locked(arena_lock) do - # Allocate a suitable region of memory. - free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} - gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) - end + # Acquire the arena's lock. + arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) + result_ptr = writer_locked(arena_lock) do + # Allocate a suitable region of memory. + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + end - # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. - # Our reasoning for doing this is that doing so ensures that the allocated memory - # won't get collected by the GC before the caller has a chance to add it to its - # own GC frame. 
- if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) - gc_frame = new_gc_frame(UInt32(1)) - unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) - end - return result_ptr + # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. + # Our reasoning for doing this is that doing so ensures that the allocated memory + # won't get collected by the GC before the caller has a chance to add it to its + # own GC frame. + if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) + gc_frame = new_gc_frame(UInt32(1)) + unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) end + return result_ptr end """ @@ -338,16 +336,18 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() # Try to malloc the object without host intervention. - ptr = gc_malloc_local(master_record.global_arena, bytesize) + ptr = @nocollect gc_malloc_local(master_record.global_arena, bytesize) if ptr != C_NULL return ptr end # We're out of memory. Ask the host to step in. - gc_collect() + ptr = writer_locked(get_interrupt_lock()) do + gc_collect_impl() - # Try to malloc again. - ptr = gc_malloc_local(master_record.global_arena, bytesize) + # Try to malloc again. + gc_malloc_local(master_record.global_arena, bytesize) + end if ptr != C_NULL return ptr end @@ -397,6 +397,12 @@ function gc_free_local_impl( zero_fill!(data_pointer(record), unsafe_load(@get_field_pointer(record, :size))) end +# Like 'gc_collect', but does not acquire the interrupt lock. +function gc_collect_impl() + interrupt_or_wait() + threadfence_system() +end + """ gc_collect() @@ -404,10 +410,7 @@ Triggers a garbage collection phase. This function is designed to be called by the device rather than by the host. """ function gc_collect() - writer_locked(get_interrupt_lock()) do - interrupt_or_wait() - threadfence_system() - end + writer_locked(gc_collect_impl, get_interrupt_lock()) end # The initial size of the GC heap, currently 16 MiB. 
From 0f1ccc6a0fa777e0a28228f9252881dfed51e701 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 11:56:55 +0100 Subject: [PATCH 045/146] Avoid overly frequent garbage collections --- src/gc.jl | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index bfc54ba8..29726824 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -341,12 +341,25 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} return ptr end - # We're out of memory. Ask the host to step in. + # We're out of memory, which means that we need the garbage collector + # to step in. Acquire the interrupt lock. ptr = writer_locked(get_interrupt_lock()) do - gc_collect_impl() - - # Try to malloc again. - gc_malloc_local(master_record.global_arena, bytesize) + # Try to allocate memory again. This is bound to fail for the + # first thread that acquires the interrupt lock, but it is quite + # likely to succeed if we are *not* in the first thread that + # acquired the garbage collector lock. + ptr2 = gc_malloc_local(master_record.global_arena, bytesize) + + if ptr2 == C_NULL + # We are either the first thread to acquire the interrupt lock + # or the additional memory produced by a previous collection has + # already been exhausted. Trigger the garbage collector. + gc_collect_impl() + + # Try to malloc again. + ptr2 = gc_malloc_local(master_record.global_arena, bytesize) + end + ptr2 end if ptr != C_NULL return ptr From e48677eca8d5181af904e6ff5719fb38a6bba024 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 12:01:44 +0100 Subject: [PATCH 046/146] Document free list compaction --- src/gc.jl | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 29726824..e861d714 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -18,6 +18,15 @@ # The GC is semi-conservative in the sense that its set of roots # is precise but objects are scanned in an imprecise way. 
# +# After every garbage collection, the GC will compact free lists: +# adjacent free list block will be merged and the free list will +# be sorted based on block sizes to combat memory fragmentation. +# +# If a free list is deemed to be "starving" after a collection, i.e., +# its total amount of free bytes has dropped below some threshold, +# then a fresh chunk of GC-managed memory is allocated and added to +# the free list. +# # MISCELLANEOUS # # Some miscellaneous GPU-related GC implementation details: @@ -437,10 +446,10 @@ const default_root_buffer_capacity = 256 # The point at which an arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. # If an arena's free byte count stays below the arena starvation -# size after a collection phase, the collector will allocate additional -# memory to the arena such that it is no longer starving. -# The arena starvation limit is currently set to 4 MiB. -const arena_starvation_limit = 4 * (1 << 20) +# threshold after a collection phase, the collector will allocate +# additional memory to the arena such that it is no longer starving. +# The arena starvation threshold is currently set to 4 MiB. +const arena_starvation_threshold = 4 * (1 << 20) # A description of a region of memory that has been allocated to the GC heap. struct GCHeapRegion @@ -754,8 +763,8 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # If the amount of free memory in the arena is below the starvation # limit then we'll expand the GC heap and add the additional memory # to the arena's free list. 
- if free_memory < arena_starvation_limit - region = expand!(heap, arena_starvation_limit) + if free_memory < arena_starvation_threshold + region = expand!(heap, arena_starvation_threshold) extra_record = make_gc_block!(region.start, region.size) last_free_list_ptr = @get_field_pointer(arena, :free_list_head) iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record From 3ec9f48f22b21e9ba2a0062b4cc596dc41f50afd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 12:22:45 +0100 Subject: [PATCH 047/146] Reserve a buffer for safepoints --- src/gc.jl | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index e861d714..c6bc3d0c 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -12,8 +12,8 @@ # GARBAGE COLLECTION # # The garbage collector itself is a semi-conservative, non-moving, -# mark-and-sweep GC that runs on the host. The device may trigger -# the GC via an interrupt. +# mark-and-sweep, stop-the-world GC that runs on the host. +# The device may trigger the GC via an interrupt. # # The GC is semi-conservative in the sense that its set of roots # is precise but objects are scanned in an imprecise way. @@ -27,6 +27,13 @@ # then a fresh chunk of GC-managed memory is allocated and added to # the free list. # +# SAFEPOINTS +# +# Every warp gets a flag that tells if that warp is in a safepoint. +# When a collection is triggered, the collector waits for every warp +# to reach a safepoint. The warps indicate that they have reached a +# safepoint by setting the flag. +# # MISCELLANEOUS # # Some miscellaneous GPU-related GC implementation details: @@ -103,12 +110,19 @@ struct GCMasterRecord # A pointer to the global GC arena. global_arena::Ptr{GCArenaRecord} + # The number of warps. + warp_count::UInt32 + + # The number of threads. + thread_count::UInt32 + # The maximum size of a GC root buffer, i.e., the maximum number # of roots per thread. 
root_buffer_capacity::UInt32 - # The number of threads. - thread_count::UInt32 + # A pointer to a list of safepoint flags. Every warp has its + # own flag. + safepoint_flags::Ptr{UInt8} # A pointer to a list of root buffer pointers that point to the # end of the root buffer for every thread. @@ -475,16 +489,25 @@ GCHeapDescription() = GCHeapDescription([]) function gc_init!( heap::GCHeapDescription, thread_count::Integer; + warp_count::Union{Integer, Nothing} = nothing, root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord + if warp_count == nothing + warp_count = thread_count / CUDAdrv.warpsize(device()) + end + master_region = heap.regions[1] gc_memory_start_ptr = master_region.start gc_memory_end_ptr = master_region.start + master_region.size + # Set up the safepoint flag buffer. + safepoint_bytesize = sizeof(UInt8) * warp_count + safepoint_ptr = Base.unsafe_convert(Ptr{UInt8}, gc_memory_start_ptr) + # Set up root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count - fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, gc_memory_start_ptr) + fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, safepoint_ptr + fingerbuf_bytesize) rootbuf_bytesize = sizeof(ObjectRef) * root_buffer_capacity * thread_count rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, fingerbuf_ptr + fingerbuf_bytesize) @@ -507,7 +530,14 @@ function gc_init!( global_arena, GCArenaRecord(0, first_entry_ptr, C_NULL)) - return GCMasterRecord(global_arena, root_buffer_capacity, UInt32(thread_count), fingerbuf_ptr, rootbuf_ptr) + return GCMasterRecord( + global_arena, + UInt32(warp_count), + UInt32(thread_count), + root_buffer_capacity, + safepoint_ptr, + fingerbuf_ptr, + rootbuf_ptr) end # Takes a zero-filled region of memory and turns it into a block From ef90bb438ac5c1decb76e5fcd23fddc612a7cb7a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 12:34:01 +0100 Subject: [PATCH 048/146] Implement a safepoint function --- src/gc.jl | 28 
++++++++++++++++++++++++---- src/interrupts.jl | 17 +++++++++++++++-- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index c6bc3d0c..bd991a3b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -43,7 +43,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. -export @cuda_gc, gc_malloc, gc_collect +export @cuda_gc, gc_malloc, gc_collect, gc_safepoint import Base: length @@ -103,6 +103,9 @@ const ObjectRef = Ptr{Nothing} # A GC frame is just a pointer to an array of Julia objects. const GCFrame = Ptr{ObjectRef} +# The type of a safepoint flag. +const SafepointFlag = UInt32 + # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed # once the host has set it up. @@ -122,7 +125,7 @@ struct GCMasterRecord # A pointer to a list of safepoint flags. Every warp has its # own flag. - safepoint_flags::Ptr{UInt8} + safepoint_flags::Ptr{SafepointFlag} # A pointer to a list of root buffer pointers that point to the # end of the root buffer for every thread. @@ -210,6 +213,23 @@ Deregisters a GC frame. return end +""" + gc_safepoint() + +Signals that this warp has reached a GC safepoint. +""" +@inline function gc_safepoint() + master_record = get_gc_master_record() + warp_id = div(get_thread_id() - 1, master_record.warp_count) + 1 + safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointFlag) * warp_id + + wait_for_interrupt() do + volatile_store!(safepoint_flag_ptr, SafepointFlag(1)) + end + + return +end + const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. @@ -502,8 +522,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Set up the safepoint flag buffer. 
-    safepoint_bytesize = sizeof(UInt8) * warp_count
-    safepoint_ptr = Base.unsafe_convert(Ptr{UInt8}, gc_memory_start_ptr)
+    safepoint_bytesize = sizeof(SafepointFlag) * warp_count
+    safepoint_ptr = Base.unsafe_convert(Ptr{SafepointFlag}, gc_memory_start_ptr)
 
     # Set up root buffers.
     fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count
diff --git a/src/interrupts.jl b/src/interrupts.jl
index 7793b42d..83fe13d5 100644
--- a/src/interrupts.jl
+++ b/src/interrupts.jl
@@ -141,6 +141,20 @@ function interrupt_or_wait()::Bool
     return prev_state == ready
 end
 
+"""
+    wait_for_interrupt(fun::Function)
+
+Waits for the current interrupt to finish, if an interrupt is
+currently running. A function is repeatedly executed until the
+interrupt finishes.
+"""
+function wait_for_interrupt(fun::Function)
+    state_ptr = get_interrupt_pointer()
+    while volatile_load(state_ptr) == processing
+        fun()
+    end
+end
+
 """
     wait_for_interrupt()
 
@@ -148,8 +162,7 @@ Waits for the current interrupt to finish, if an interrupt is
 currently running.
 """
 function wait_for_interrupt()
-    state_ptr = get_interrupt_pointer()
-    while volatile_load(state_ptr) == processing
+    wait_for_interrupt() do
     end
 end
 

From a76a568aebb021d259678352a769e5936cdc9abe Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Fri, 8 Mar 2019 12:45:16 +0100
Subject: [PATCH 049/146] Put safepoint flag values in an enum

---
 src/gc.jl | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/gc.jl b/src/gc.jl
index bd991a3b..a523e371 100644
--- a/src/gc.jl
+++ b/src/gc.jl
@@ -104,7 +104,18 @@ const ObjectRef = Ptr{Nothing}
 const GCFrame = Ptr{ObjectRef}
 
 # The type of a safepoint flag.
-const SafepointFlag = UInt32
+@enum SafepointFlag::UInt32 begin
+    # Indicates that a warp is not in a safepoint.
+    not_in_safepoint = 0
+    # Indicates that a warp is in a safepoint. This
+    # flag will be reset to `not_in_safepoint` by the
+    # collector on the next collection.
+ in_safepoint = 1 + # Indicates that a warp is in a perma-safepoint: + # the collector will not try to set this type + # of safepoint back to `not_in_safepoint`. + in_perma_safepoint = 2 +end # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed @@ -224,7 +235,7 @@ Signals that this warp has reached a GC safepoint. safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointFlag) * warp_id wait_for_interrupt() do - volatile_store!(safepoint_flag_ptr, SafepointFlag(1)) + volatile_store!(safepoint_flag_ptr, in_safepoint) end return From 23e128c2e30275ac71a6d871f025f71d362d9f13 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 14:08:12 +0100 Subject: [PATCH 050/146] Implement stop-the-world part of the GC --- src/gc.jl | 68 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index a523e371..a3e7d4b6 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -103,8 +103,8 @@ const ObjectRef = Ptr{Nothing} # A GC frame is just a pointer to an array of Julia objects. const GCFrame = Ptr{ObjectRef} -# The type of a safepoint flag. -@enum SafepointFlag::UInt32 begin +# The states a safepoint flag can have. +@enum SafepointState::UInt32 begin # Indicates that a warp is not in a safepoint. not_in_safepoint = 0 # Indicates that a warp is in a safepoint. This @@ -136,7 +136,7 @@ struct GCMasterRecord # A pointer to a list of safepoint flags. Every warp has its # own flag. - safepoint_flags::Ptr{SafepointFlag} + safepoint_flags::Ptr{SafepointState} # A pointer to a list of root buffer pointers that point to the # end of the root buffer for every thread. @@ -181,6 +181,11 @@ end return (blockIdx().x - 1) * blockDim().x + threadIdx().x end +# Gets the warp ID of the current thread. 
+@inline function get_warp_id() + return div(get_thread_id() - 1, warpsize()) + 1 +end + """ new_gc_frame(size::UInt32)::GCFrame @@ -229,18 +234,34 @@ end Signals that this warp has reached a GC safepoint. """ -@inline function gc_safepoint() - master_record = get_gc_master_record() - warp_id = div(get_thread_id() - 1, master_record.warp_count) + 1 - safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointFlag) * warp_id - +function gc_safepoint() wait_for_interrupt() do - volatile_store!(safepoint_flag_ptr, in_safepoint) + gc_set_safepoint_flag(in_safepoint) end + return +end +# Sets this warp's safepoint flag to a particular state. +function gc_set_safepoint_flag(value::SafepointState) + master_record = get_gc_master_record() + warp_id = get_warp_id() + safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointState) * (warp_id - 1) + volatile_store!(safepoint_flag_ptr, value) return end +# Marks a region as a perma-safepoint: the entire region +# is a safepoint. Note that perma-safepoints are not allowed +# to include non-perma-safepoints. +macro perma_safepoint(expr) + quote + gc_set_safepoint_flag(in_perma_safepoint) + local result = $(esc(expr)) + gc_set_safepoint_flag(not_in_safepoint) + result + end +end + const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. @@ -390,14 +411,14 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() # Try to malloc the object without host intervention. - ptr = @nocollect gc_malloc_local(master_record.global_arena, bytesize) + ptr = @perma_safepoint @nocollect gc_malloc_local(master_record.global_arena, bytesize) if ptr != C_NULL return ptr end # We're out of memory, which means that we need the garbage collector - # to step in. Acquire the interrupt lock. - ptr = writer_locked(get_interrupt_lock()) do + # to step in. Set a perma-safepoint and acquire the interrupt lock. 
+ ptr = @perma_safepoint writer_locked(get_interrupt_lock()) do # Try to allocate memory again. This is bound to fail for the # first thread that acquires the interrupt lock, but it is quite # likely to succeed if we are *not* in the first thread that @@ -533,8 +554,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Set up the safepoint flag buffer. - safepoint_bytesize = sizeof(SafepointFlag) * warp_count - safepoint_ptr = Base.unsafe_convert(Ptr{SafepointFlag}, gc_memory_start_ptr) + safepoint_bytesize = sizeof(SafepointState) * warp_count + safepoint_ptr = Base.unsafe_convert(Ptr{SafepointState}, gc_memory_start_ptr) # Set up root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count @@ -743,6 +764,25 @@ end # Collects garbage. This function is designed to be called by the host, # not by the device. function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) + # First off, we have to wait for all warps to reach a safepoint. Clear + # safepoint flags and wait for warps to set them again. + for i in 0:(master_record.warp_count - 1) + atomic_compare_exchange!( + master_record.safepoint_flags + i * sizeof(SafepointState), + in_safepoint, + not_in_safepoint) + end + safepoint_count = 0 + while safepoint_count != master_record.warp_count + safepoint_count = 0 + for i in 0:(master_record.warp_count - 1) + state = volatile_load(master_record.safepoint_flags + i * sizeof(SafepointState)) + if state != not_in_safepoint + safepoint_count += 1 + end + end + end + # The Julia CPU GC is precise and the information it uses for precise # garbage collection is stored in memory that we should be able to access. 
# However, the way the CPU GC stores field information is incredibly From 0875425b9113e590cbfc40b28ab06e12feabf630 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 16:00:34 +0100 Subject: [PATCH 051/146] Automatically insert safepoints --- src/compiler/optim.jl | 61 +++++++++++++++++++++++++++++++++++++++++++ src/device/runtime.jl | 3 +++ 2 files changed, 64 insertions(+) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index f798f96e..f29a2a0c 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -71,6 +71,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int initialize!(pm) # lower intrinsics if ctx.gc + add!(pm, FunctionPass("InsertSafepointsGPUGC", insert_safepoints_gpugc!)) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) else add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) @@ -586,6 +587,66 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) return changed end +# Tells if a function manages a GC frame. +function has_gc_frame(fun::LLVM.Function) + for insn in instructions(entry(fun)) + if isa(insn, LLVM.CallInst) + callee = called_value(insn) + if isa(callee, LLVM.Function) && LLVM.name(callee) == "julia.new_gc_frame" + return true + end + end + end + return false +end + +# Tells if an instruction is a call to a non-intrinsic callee. +function is_non_intrinsic_call(instruction::LLVM.Instruction) + if isa(instruction, LLVM.CallInst) + callee = called_value(instruction) + if isa(callee, LLVM.Function) + callee_name = LLVM.name(callee) + return !startswith(callee_name, "julia.") && !startswith(callee_name, "llvm.") + else + return true + end + else + return false + end +end + +""" + insert_safepoints_gpugc!(fun::LLVM.Function) + +An LLVM pass that inserts GC safepoints in such a way that threads +reach a safepoint after a reasonable amount of time. 
+""" +function insert_safepoints_gpugc!(fun::LLVM.Function) + # Insert a safepoint before every function call, but only for + # functions that manage a GC frame. + # + # TODO: also insert safepoints on loop back-edges? This is what people + # usually do, but it requires nontrivial IR analyses that the LLVM C + # API doesn't expose. + + if has_gc_frame(fun) + let builder = Builder(JuliaContext()) + for block in blocks(fun) + for instruction in instructions(block) + if is_non_intrinsic_call(instruction) + # Insert a safepoint just before the call. + position!(builder, instruction) + debuglocation!(builder, instruction) + call!(builder, Runtime.get(:gc_safepoint), LLVM.Value[]) + end + end + end + dispose(builder) + end + end + return true +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. # # this assumes and checks that the TLS is unused, which should be the case for most GPU code diff --git a/src/device/runtime.jl b/src/device/runtime.jl index e456c356..1c492369 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -261,4 +261,7 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_pprjlvalue()]) +# Also import the safepoint function. 
+compile(CUDAnative.gc_safepoint, Cvoid, ()) + end From 3ad1ee8d07409c9b0cea92e9c40550542b7d0e14 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 8 Mar 2019 16:03:41 +0100 Subject: [PATCH 052/146] Update GC example --- examples/{gc-malloc.jl => gc.jl} | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) rename examples/{gc-malloc.jl => gc.jl} (67%) diff --git a/examples/gc-malloc.jl b/examples/gc.jl similarity index 67% rename from examples/gc-malloc.jl rename to examples/gc.jl index 597ed2ae..211a2fb4 100644 --- a/examples/gc-malloc.jl +++ b/examples/gc.jl @@ -1,18 +1,30 @@ -using CUDAdrv, CUDAnative +using CUDAdrv, CUDAnative, LLVM +using InteractiveUtils using Test +mutable struct TempStruct + data::Float32 +end + +@noinline function escape(val) + Base.pointer_from_objref(val) +end + # Define a kernel that copies values using a temporary buffer. function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) i = (blockIdx().x-1) * blockDim().x + threadIdx().x - buffer = Base.unsafe_convert(Ptr{Float32}, gc_malloc(sizeof(Float32) * Csize_t(16))) - unsafe_store!(buffer, unsafe_load(a, i), i % 13) - unsafe_store!(b, unsafe_load(buffer, i % 13), i) + for j in 1:256 + # Allocate a mutable struct and make sure it ends up on the GC heap. + temp = TempStruct(unsafe_load(a, i)) + escape(temp) + unsafe_store!(b, temp.data, i) + end return end -thread_count = 64 +thread_count = 256 # Allocate two arrays. 
source_array = Mem.alloc(Float32, thread_count) From a0fbee87c515ad871d64a5e594e2e7681b8051e1 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 11 Mar 2019 15:18:27 +0100 Subject: [PATCH 053/146] Add a binary search tree example --- examples/binary-tree.jl | 158 ++++++++++++++++++++++++++++++++++++++++ examples/gc.jl | 3 +- 2 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 examples/binary-tree.jl diff --git a/examples/binary-tree.jl b/examples/binary-tree.jl new file mode 100644 index 00000000..5fb0c19a --- /dev/null +++ b/examples/binary-tree.jl @@ -0,0 +1,158 @@ +using CUDAdrv, CUDAnative +using Random, Test +import Base: haskey, insert! + +# This example defines a kernel that constructs a binary search +# tree for a set of numbers and then proceeds to test membership +# in that tree for a sequence of other numbers. +# +# The main point of this example is to demonstrate that even +# naive, pointer-chasing programs can be compiled to GPU kernels. + +"""A binary search tree node.""" +abstract type BinarySearchTreeNode{T} end + +"""An internal node of a binary search tree.""" +mutable struct InternalNode{T} <: BinarySearchTreeNode{T} + value::T + left::BinarySearchTreeNode{T} + right::BinarySearchTreeNode{T} +end + +InternalNode{T}(value::T) where T = InternalNode{T}(value, LeafNode{T}(), LeafNode{T}()) + +"""A leaf node of a binary search tree.""" +mutable struct LeafNode{T} <: BinarySearchTreeNode{T} end + +"""A binary search tree data structure.""" +mutable struct BinarySearchTree{T} + root::BinarySearchTreeNode{T} +end + +"""Creates an empty binary search tree.""" +BinarySearchTree{T}() where T = BinarySearchTree{T}(LeafNode{T}()) + +"""Tells if a binary search tree contains a particular element.""" +function haskey(tree::BinarySearchTree{T}, value::T)::Bool where T + walk = tree.root + while isa(walk, InternalNode{T}) + if walk.value == value + return true + elseif walk.value > value + walk = walk.right + else + walk = walk.left + end + 
end + return false +end + +"""Inserts an element into a binary search tree.""" +function insert!(tree::BinarySearchTree{T}, value::T) where T + if !isa(tree.root, InternalNode{T}) + tree.root = InternalNode{T}(value) + return + end + + walk = tree.root::InternalNode{T} + while true + if walk.value == value + return + elseif walk.value > value + right = walk.right + if isa(right, InternalNode{T}) + walk = right + else + walk.right = InternalNode{T}(value) + return + end + else + left = walk.left + if isa(left, InternalNode{T}) + walk = left + else + walk.left = InternalNode{T}(value) + return + end + end + end +end + +""" +Creates a binary search tree that contains elements copied from a device array. +""" +function BinarySearchTree{T}(elements::CUDAnative.DevicePtr{T}, size::Integer) where T + tree = BinarySearchTree{T}() + for i in 1:size + insert!(tree, unsafe_load(elements, i)) + end + tree +end + +""" +Creates a binary search tree that contains elements copied from an array. +""" +function BinarySearchTree{T}(elements::Array{T}) where T + tree = BinarySearchTree{T}() + for i in 1:length(elements) + insert!(tree, elements[i]) + end + tree +end + +# Gets a sequence of Fibonacci numbers. +function fibonacci(::Type{T}, count::Integer)::Array{T} where T + if count == 0 + return [] + elseif count == 1 + return [one(T)] + end + + results = [one(T), one(T)] + for i in 1:(count - 2) + push!(results, results[length(results) - 1] + results[length(results)]) + end + return results +end + +const number_count = 2000 +const thread_count = 32 +const tests_per_thread = 2000 + +# Define a kernel that copies values using a temporary buffer. 
+function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + tree = BinarySearchTree{Int64}(a, number_count) + + for j in 1:tests_per_thread + offset = (i - 1) * tests_per_thread + index = offset + j + unsafe_store!(b, haskey(tree, unsafe_load(b, index)), index) + end + + return +end + +# Generate a sequence of 64-bit truncated Fibonacci numbers. +number_set = fibonacci(Int64, number_count) +# Randomize the sequence's order. +shuffle!(number_set) + +# Generate numbers for which we will test membership in the sequence. +test_sequence = Array(1:(thread_count * tests_per_thread)) + +# Allocate two arrays. +source_array = Mem.alloc(Int64, length(number_set)) +destination_array = Mem.alloc(Int64, length(test_sequence)) +source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +# Fill the source and destination arrays. +Mem.upload!(source_array, number_set) +Mem.upload!(destination_array, test_sequence) + +# Run the kernel. 
+@cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + +@test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) diff --git a/examples/gc.jl b/examples/gc.jl index 211a2fb4..38e2ff7e 100644 --- a/examples/gc.jl +++ b/examples/gc.jl @@ -1,5 +1,4 @@ -using CUDAdrv, CUDAnative, LLVM -using InteractiveUtils +using CUDAdrv, CUDAnative using Test mutable struct TempStruct From 3e2a8ff6c3c4f87e678cfc72004edec30905e62d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 11 Mar 2019 18:03:23 +0100 Subject: [PATCH 054/146] Use local arenas to reduce GC lock contention --- src/gc.jl | 134 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 28 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index a3e7d4b6..e29b4349 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -121,9 +121,6 @@ end # structure is designed to be immutable: it should not be changed # once the host has set it up. struct GCMasterRecord - # A pointer to the global GC arena. - global_arena::Ptr{GCArenaRecord} - # The number of warps. warp_count::UInt32 @@ -134,6 +131,15 @@ struct GCMasterRecord # of roots per thread. root_buffer_capacity::UInt32 + # The number of local arenas. + local_arena_count::UInt32 + + # A pointer to a list of local GC arena pointers. + local_arenas::Ptr{Ptr{GCArenaRecord}} + + # A pointer to the global GC arena. + global_arena::Ptr{GCArenaRecord} + # A pointer to a list of safepoint flags. Every warp has its # own flag. safepoint_flags::Ptr{SafepointState} @@ -149,6 +155,9 @@ end # Iterates through all arena pointers stored in a GC master record. 
@inline function iterate_arenas(fun::Function, master_record::GCMasterRecord) + for i in 1:master_record.local_arena_count + fun(unsafe_load(master_record.local_arenas, i)) + end fun(master_record.global_arena) end @@ -186,6 +195,19 @@ end return div(get_thread_id() - 1, warpsize()) + 1 end +# Gets a pointer to the local arena for this thread. This +# pointer may be null if there are no local arenas. +@inline function get_local_arena()::Ptr{GCArenaRecord} + master_record = get_gc_master_record() + if master_record.local_arena_count == UInt32(0) + return C_NULL + else + return unsafe_load( + master_record.local_arenas, + get_warp_id() % master_record.local_arena_count) + end +end + """ new_gc_frame(size::UInt32)::GCFrame @@ -411,7 +433,23 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() # Try to malloc the object without host intervention. - ptr = @perma_safepoint @nocollect gc_malloc_local(master_record.global_arena, bytesize) + ptr = @perma_safepoint @nocollect begin + # Try to allocate in the local arena first. If that doesn't + # work, we'll move on to the global arena, which is bigger but + # is shared by all threads. (We want to minimize contention + # on the global arena's lock.) + local_arena = get_local_arena() + local_ptr = Base.unsafe_convert(Ptr{UInt8}, C_NULL) + if local_arena != C_NULL + local_ptr = gc_malloc_local(local_arena, bytesize) + end + + if local_ptr == C_NULL + gc_malloc_local(master_record.global_arena, bytesize) + else + local_ptr + end + end if ptr != C_NULL return ptr end @@ -423,6 +461,10 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} # first thread that acquires the interrupt lock, but it is quite # likely to succeed if we are *not* in the first thread that # acquired the garbage collector lock. + # + # Note: don't try to allocate in the local arena first because + # we have already acquired a device-wide lock. Allocating in + # the local arena first might waste precious time. 
ptr2 = gc_malloc_local(master_record.global_arena, bytesize) if ptr2 == C_NULL @@ -464,7 +506,7 @@ end # turn off collections. It can be called by the device, but in that # case it should be prefixed by the `@nocollect` macro followed by # a write lock acquisition on the arena's lock. -function gc_free_local_impl( +function gc_free_local( arena::Ptr{GCArenaRecord}, record_ptr::Ptr{Ptr{GCAllocationRecord}}) @@ -501,21 +543,32 @@ function gc_collect() writer_locked(gc_collect_impl, get_interrupt_lock()) end +# One megabyte. +const MiB = 1 << 20 + # The initial size of the GC heap, currently 16 MiB. -const initial_gc_heap_size = 16 * (1 << 20) +const initial_gc_heap_size = 16 * MiB # The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to # 256 roots. That's 2 KiB of roots per thread. const default_root_buffer_capacity = 256 -# The point at which an arena is deemed to be starving, i.e., +# The point at which the global arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. -# If an arena's free byte count stays below the arena starvation +# If the global arena's free byte count stays below the arena starvation # threshold after a collection phase, the collector will allocate # additional memory to the arena such that it is no longer starving. # The arena starvation threshold is currently set to 4 MiB. -const arena_starvation_threshold = 4 * (1 << 20) +const global_arena_starvation_threshold = 4 * MiB + +# The point at which a local arena is deemed to be starving, i.e., +# it no longer contains enough memory to perform basic allocations. +# If a local arena's free byte count stays below the arena starvation +# threshold after a collection phase, the collector will allocate +# additional memory to the arena such that it is no longer starving. +# The arena starvation threshold is currently set to 1 MiB. 
+const local_arena_starvation_threshold = 1 * MiB # A description of a region of memory that has been allocated to the GC heap. struct GCHeapRegion @@ -542,7 +595,8 @@ function gc_init!( heap::GCHeapDescription, thread_count::Integer; warp_count::Union{Integer, Nothing} = nothing, - root_buffer_capacity::Integer = default_root_buffer_capacity)::GCMasterRecord + root_buffer_capacity::Integer = default_root_buffer_capacity, + local_arena_count::Integer = 8)::GCMasterRecord if warp_count == nothing warp_count = thread_count / CUDAdrv.warpsize(device()) @@ -553,11 +607,15 @@ function gc_init!( gc_memory_start_ptr = master_region.start gc_memory_end_ptr = master_region.start + master_region.size - # Set up the safepoint flag buffer. + # Allocate a local arena pointer buffer. + local_arenas_bytesize = sizeof(Ptr{GCArenaRecord}) * local_arena_count + local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{GCArenaRecord}}, gc_memory_start_ptr) + + # Allocate the safepoint flag buffer. safepoint_bytesize = sizeof(SafepointState) * warp_count - safepoint_ptr = Base.unsafe_convert(Ptr{SafepointState}, gc_memory_start_ptr) + safepoint_ptr = Base.unsafe_convert(Ptr{SafepointState}, local_arenas_ptr + local_arenas_bytesize) - # Set up root buffers. + # Allocate root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, safepoint_ptr + fingerbuf_bytesize) rootbuf_bytesize = sizeof(ObjectRef) * root_buffer_capacity * thread_count @@ -568,25 +626,26 @@ function gc_init!( unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * root_buffer_capacity, i) end - # Compute a pointer to the start of the heap. - heap_start_ptr = rootbuf_ptr + rootbuf_bytesize + # Compute a pointer to the start of the first arena. + arena_start_ptr = rootbuf_ptr + rootbuf_bytesize - # Create a single free list entry. 
- first_entry_ptr = make_gc_block!( - heap_start_ptr + sizeof(GCArenaRecord), - Csize_t(gc_memory_end_ptr) - Csize_t(heap_start_ptr) - sizeof(GCArenaRecord)) + # Set up local arenas. + for i in 1:local_arena_count + local_arena = make_gc_arena!(arena_start_ptr, Csize_t(local_arena_starvation_threshold)) + unsafe_store!(local_arenas_ptr, local_arena, i) + arena_start_ptr += local_arena_starvation_threshold + end - # Set up the main GC data structure. - global_arena = Base.unsafe_convert(Ptr{GCArenaRecord}, heap_start_ptr) - unsafe_store!( - global_arena, - GCArenaRecord(0, first_entry_ptr, C_NULL)) + # Set up the global arena. + global_arena = make_gc_arena!(arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( - global_arena, UInt32(warp_count), UInt32(thread_count), root_buffer_capacity, + UInt32(local_arena_count), + local_arenas_ptr, + global_arena, safepoint_ptr, fingerbuf_ptr, rootbuf_ptr) @@ -604,6 +663,19 @@ function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCAllocationRecor return entry end +# Takes a zero-filled region of memory and turns it into an arena +# managed by the GC, prefixed with an arena record. +function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCArenaRecord} where T + # Create a single free list entry. + first_entry_ptr = make_gc_block!(start_ptr + sizeof(GCArenaRecord), size - sizeof(GCArenaRecord)) + + # Set up the arena record. + arena = Base.unsafe_convert(Ptr{GCArenaRecord}, start_ptr) + unsafe_store!( + arena, + GCArenaRecord(0, first_entry_ptr, C_NULL)) +end + # Tells if a GC heap contains a particular pointer. function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T for region in heap.regions @@ -854,7 +926,7 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # We found a dead block. Release it. Don't proceed to the # next block because the current block will change in the # next iteration of this loop. 
- gc_free_local_impl(arena, record_ptr) + gc_free_local(arena, record_ptr) end end @@ -864,8 +936,14 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) # If the amount of free memory in the arena is below the starvation # limit then we'll expand the GC heap and add the additional memory # to the arena's free list. - if free_memory < arena_starvation_threshold - region = expand!(heap, arena_starvation_threshold) + threshold = if arena == master_record.global_arena + global_arena_starvation_threshold + else + local_arena_starvation_threshold + end + + if free_memory < threshold + region = expand!(heap, threshold) extra_record = make_gc_block!(region.start, region.size) last_free_list_ptr = @get_field_pointer(arena, :free_list_head) iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record From 48eb3f539b555e5974c28c4eeff4e833714ba5b7 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 11 Mar 2019 18:42:00 +0100 Subject: [PATCH 055/146] Automatically insert perma-safepoints --- src/compiler/optim.jl | 69 ++++++++++++++++++++++++++++++++++++++++--- src/device/runtime.jl | 3 +- src/gc.jl | 27 ++++++++++++++--- 3 files changed, 90 insertions(+), 9 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index f29a2a0c..cdd127de 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -11,7 +11,24 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int add_library_info!(pm, triple(mod)) add_transform_info!(pm, tm) if internalize - internalize!(pm, [LLVM.name(entry)]) + # We want to internalize functions so we can optimize + # them, but we don't really want to internalize globals + # because doing so may cause multiple copies of the same + # globals to appear after linking together modules. + # + # For example, the runtime library includes GC-related globals. 
+ # It is imperative that these globals are shared by all modules, + # but if they are internalized before they are linked then + # they will actually not be internalized. + # + # Also, don't internalize the entry point, for obvious reasons. + non_internalizable_names = [LLVM.name(entry)] + for val in globals(mod) + if isa(val, LLVM.GlobalVariable) + push!(non_internalizable_names, LLVM.name(val)) + end + end + internalize!(pm, non_internalizable_names) end end @@ -71,7 +88,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int initialize!(pm) # lower intrinsics if ctx.gc - add!(pm, FunctionPass("InsertSafepointsGPUGC", insert_safepoints_gpugc!)) + add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) else add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) @@ -616,12 +633,16 @@ function is_non_intrinsic_call(instruction::LLVM.Instruction) end """ - insert_safepoints_gpugc!(fun::LLVM.Function) + insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) An LLVM pass that inserts GC safepoints in such a way that threads reach a safepoint after a reasonable amount of time. + +Moreover, this pass also inserts perma-safepoints after entry point returns. +Perma-safepoints inform the GC that it doesn't need to wait for a warp to +reach a safepoint; inserting them stops the GC from deadlocking. """ -function insert_safepoints_gpugc!(fun::LLVM.Function) +function insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) # Insert a safepoint before every function call, but only for # functions that manage a GC frame. # @@ -644,6 +665,46 @@ function insert_safepoints_gpugc!(fun::LLVM.Function) dispose(builder) end end + + # Insert perma-safepoints if necessary. + if fun == entry + # Looks like we're going to have to insert perma-safepoints. 
+        # We need to keep in mind that perma-safepoints are per-warp,
+        # so we absolutely cannot allow warps to be in a divergent
+        # state when a perma-safepoint is set---all bets are off if
+        # that happens anyway.
+        #
+        # To make sure that we don't end up in that situation,
+        # we will create a dedicated return block and replace all 'ret'
+        # instructions by jumps to that return block.
+
+        # Create the dedicated return block.
+        return_block = BasicBlock(fun, "kernel_exit")
+        let builder = Builder(JuliaContext())
+            position!(builder, return_block)
+            call!(builder, Runtime.get(:gc_perma_safepoint), LLVM.Value[])
+            ret!(builder)
+            dispose(builder)
+        end
+
+        # Rewrite return instructions as branches to the return block.
+        for block in blocks(fun)
+            if block == return_block
+                # We need to be careful not to trick ourselves into
+                # turning the return block's 'ret' into an infinite loop.
+                continue
+            end
+            term = terminator(block)
+            if isa(term, LLVM.RetInst)
+                unsafe_delete!(block, term)
+                let builder = Builder(JuliaContext())
+                    position!(builder, block)
+                    br!(builder, return_block)
+                    dispose(builder)
+                end
+            end
+        end
+    end
     return true
 end
 
diff --git a/src/device/runtime.jl b/src/device/runtime.jl
index 1c492369..27589633 100644
--- a/src/device/runtime.jl
+++ b/src/device/runtime.jl
@@ -261,7 +261,8 @@ compile(
     () -> convert(LLVMType, Cvoid),
     () -> [T_pprjlvalue()])
 
-# Also import the safepoint function.
+# Also import the safepoint and perma-safepoint functions.
 compile(CUDAnative.gc_safepoint, Cvoid, ())
+compile(CUDAnative.gc_perma_safepoint, Cvoid, ())
 
 end
diff --git a/src/gc.jl b/src/gc.jl
index e29b4349..60b375a9 100644
--- a/src/gc.jl
+++ b/src/gc.jl
@@ -263,6 +263,24 @@ function gc_safepoint()
     return
 end
 
+"""
+    gc_perma_safepoint()
+
+Signals that this warp has reached a GC perma-safepoint:
+the GC doesn't need to wait for this warp to reach a safepoint
+before starting collections. Instead, the GC may assume that
+the warp is already in a safepoint.
+ +Be careful with this function: all bets are off when this +function is used improperly. For a more controlled (but still +super dangerous) way to use perma-safepoints, see the +`@perma_safepoint` macro. +""" +function gc_perma_safepoint() + gc_set_safepoint_flag(in_perma_safepoint) + return +end + # Sets this warp's safepoint flag to a particular state. function gc_set_safepoint_flag(value::SafepointState) master_record = get_gc_master_record() @@ -277,7 +295,7 @@ end # to include non-perma-safepoints. macro perma_safepoint(expr) quote - gc_set_safepoint_flag(in_perma_safepoint) + gc_perma_safepoint() local result = $(esc(expr)) gc_set_safepoint_flag(not_in_safepoint) result @@ -972,9 +990,10 @@ end High-level interface for executing code on a GPU with GC support. The `@cuda_gc` macro should prefix a call, with `func` a callable function or object that should return nothing. It will be compiled to a CUDA function upon first -use, and to a certain extent arguments will be converted and anaged automatically using -`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel -launch on the current CUDA context. +use, and to a certain extent arguments will be converted and managed automatically using +`cudaconvert`. Next, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. Finally, `@cuda_gc` waits for the kernel to finish, +performing garbage collection in the meantime if necessary. Several keyword arguments are supported that influence kernel compilation and execution. 
For more information, refer to the documentation of respectively [`cufunction`](@ref) and From 4a634e40ed5ea957c12e291ff28234d66357ac5a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 11 Mar 2019 18:57:46 +0100 Subject: [PATCH 056/146] Add a comprehensive GC test --- test/device/gc.jl | 70 +++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 2 files changed, 71 insertions(+) create mode 100644 test/device/gc.jl diff --git a/test/device/gc.jl b/test/device/gc.jl new file mode 100644 index 00000000..1ec9b0fc --- /dev/null +++ b/test/device/gc.jl @@ -0,0 +1,70 @@ +@testset "gc" begin + +############################################################################################ + +dummy() = return + +dummy_handler(kernel) = return + +@testset "@cuda_gc" begin + +@testset "allocate and collect" begin + # This test allocates many very small and very large objects. Both the small + # and large objects become garbage eventually, but small objects need to + # outlive the large objects (and not be collected erroneously) for the test + # to pass. So essentially this test tackles three things: + # + # 1. Allocation works. + # 2. Collection works. + # 3. Collection isn't gung-ho to the point of incorrectness. + # + + mutable struct TempStruct + data::Float32 + end + + @noinline function escape(val) + Base.pointer_from_objref(val) + end + + # Define a kernel that copies values using a temporary struct. + function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + for j in 1:2 + # Allocate a mutable struct and make sure it ends up on the GC heap. + temp = TempStruct(unsafe_load(a, i)) + escape(temp) + + # Allocate a large garbage buffer to force collections. + gc_malloc(Csize_t(256 * 1024)) + + # Use the mutable struct. If its memory has been reclaimed (by accident) + # then we expect the test at the end of this file to fail. 
+ unsafe_store!(b, temp.data, i) + end + + return + end + + thread_count = 64 + + # Allocate two arrays. + source_array = Mem.alloc(Float32, thread_count) + destination_array = Mem.alloc(Float32, thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + + # Fill the source and destination arrays. + Mem.upload!(source_array, fill(42.f0, thread_count)) + Mem.upload!(destination_array, zeros(Float32, thread_count)) + + # Run the kernel. + @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + + @test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) +end + +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index 05e1687f..6cac0eb5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -71,6 +71,7 @@ if CUDAnative.configured include("device/cuda.jl") include("device/intrinsics.jl") include("device/threading.jl") + include("device/gc.jl") #include("examples.jl") end From d1ce8c7f57e10b559ac5ebee953e0594b12927df Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 15 Mar 2019 15:03:06 +0100 Subject: [PATCH 057/146] Do not serialize warps for reader locks --- src/device/threading.jl | 38 ++++++++++++++++++-------------------- src/gc.jl | 2 +- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/device/threading.jl b/src/device/threading.jl index 8bbeadf9..8723ebe4 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -107,28 +107,26 @@ Acquires a reader-writer lock in reader mode, runs `func` while the lock is acquired and releases the lock again. """ function reader_locked(func::Function, lock::ReaderWriterLock) - warp_serialized() do - while true - # Increment the reader count. If the lock is in write-acquired mode, - # then the lock will stay in that mode (unless the reader count is - # exceeded, but that is virtually impossible). 
Otherwise, the lock - # will end up in read-acquired mode. - previous_state = atomic_add!(lock.state_ptr, 1) - - # If the lock was in the idle or read-acquired state, then - # it is now in read-acquired mode. - if previous_state >= 0 - # Run the function. - result = func() - # Decrement the reader count to release the reader lock. - atomic_add!(lock.state_ptr, -1) - # We're done here. - return result - end - - # Decrement the reader count and try again. + while true + # Increment the reader count. If the lock is in write-acquired mode, + # then the lock will stay in that mode (unless the reader count is + # exceeded, but that is virtually impossible). Otherwise, the lock + # will end up in read-acquired mode. + previous_state = atomic_add!(lock.state_ptr, 1) + + # If the lock was in the idle or read-acquired state, then + # it is now in read-acquired mode. + if previous_state >= 0 + # Run the function. + result = func() + # Decrement the reader count to release the reader lock. atomic_add!(lock.state_ptr, -1) + # We're done here. + return result end + + # Decrement the reader count and try again. 
+ atomic_add!(lock.state_ptr, -1) end end diff --git a/src/gc.jl b/src/gc.jl index 60b375a9..820bab11 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -204,7 +204,7 @@ end else return unsafe_load( master_record.local_arenas, - get_warp_id() % master_record.local_arena_count) + get_thread_id() % master_record.local_arena_count) end end From 513168118b3903c755b1a651431ef5bef1ffd124 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 15 Mar 2019 18:25:19 +0100 Subject: [PATCH 058/146] Define a GPU Mutex type --- examples/gc.jl | 12 +++- examples/lock.jl | 25 ++++++--- src/device/threading.jl | 119 +++++++++++++++++++++++++++++++++++++--- src/gc.jl | 10 ---- 4 files changed, 137 insertions(+), 29 deletions(-) diff --git a/examples/gc.jl b/examples/gc.jl index 38e2ff7e..51fe758e 100644 --- a/examples/gc.jl +++ b/examples/gc.jl @@ -9,14 +9,20 @@ end Base.pointer_from_objref(val) end -# Define a kernel that copies values using a temporary buffer. +# Define a kernel that copies values using a temporary struct. function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) - i = (blockIdx().x-1) * blockDim().x + threadIdx().x + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - for j in 1:256 + for j in 1:2 # Allocate a mutable struct and make sure it ends up on the GC heap. temp = TempStruct(unsafe_load(a, i)) escape(temp) + + # Allocate a large garbage buffer to force collections. + gc_malloc(Csize_t(256 * 1024)) + + # Use the mutable struct. If its memory has been reclaimed (by accident) + # then we expect the test at the end of this file to fail. unsafe_store!(b, temp.data, i) end diff --git a/examples/lock.jl b/examples/lock.jl index b4269a7b..1e06efdb 100644 --- a/examples/lock.jl +++ b/examples/lock.jl @@ -1,13 +1,20 @@ using CUDAdrv, CUDAnative using Test -thread_count = 128 +const thread_count = Int32(128) +const total_count = Int32(1024) # Define a kernel that atomically increments a counter using a lock. 
-function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) - lock = ReaderWriterLock(lock_state) - writer_locked(lock) do - unsafe_store!(counter, unsafe_load(counter) + 1) +function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.MutexState}) + lock = Mutex(lock_state) + done = false + while !done && try_lock(lock) + new_count = unsafe_load(counter) + 1 + unsafe_store!(counter, new_count) + if new_count == total_count + done = true + end + CUDAnative.unlock(lock) end return end @@ -17,9 +24,9 @@ counter_buf = Mem.alloc(sizeof(Int32)) Mem.upload!(counter_buf, [Int32(0)]) counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) -lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) -Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) -lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) +lock_buf = Mem.alloc(sizeof(CUDAnative.MutexState)) +Mem.upload!(lock_buf, [CUDAnative.MutexState(0)]) +lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.MutexState}, lock_buf) # @device_code_warntype increment_counter(counter_pointer, lock_pointer) @@ -28,4 +35,4 @@ lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock # Check that the counter's final value equals the number # of threads. -@test Mem.download(Int32, counter_buf) == [Int32(thread_count)] +@test Mem.download(Int32, counter_buf) == [Int32(total_count)] diff --git a/src/device/threading.jl b/src/device/threading.jl index 8723ebe4..951c20e8 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -1,6 +1,6 @@ # This file implements threading primitives that work for CUDAnative kernels. -export ReaderWriterLock, reader_locked, writer_locked +export ReaderWriterLock, reader_locked, writer_locked, Mutex, try_lock, unlock # Gets a pointer to a global with a particular name. 
If the global # does not exist yet, then it is declared in the global memory address @@ -17,19 +17,35 @@ export ReaderWriterLock, reader_locked, writer_locked :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) end -# Atomically adds a value to a variable pointed to by a pointer. -# Returns the previous value stored in that value. -@generated function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T +@generated function atomic_rmw!(::Val{op}, lhs::Ptr{T}, rhs::T)::T where {op, T} ptr_type = convert(LLVMType, Ptr{T}) lt = string(convert(LLVMType, T)) ir = """ %ptr = inttoptr $ptr_type %0 to $lt* - %rv = atomicrmw volatile add $lt* %ptr, $lt %1 seq_cst + %rv = atomicrmw volatile $(String(op)) $lt* %ptr, $lt %1 seq_cst ret $lt %rv """ :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T}, lhs, rhs)) end +# Atomically adds a value to a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:add), lhs, rhs) +end + +# Atomically computes the logical or of a value and a variable pointed +# to by a pointer. Returns the previous value stored in that variable. +function atomic_or!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:or), lhs, rhs) +end + +# Atomically assigns a new value to a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_exchange!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:xchg), lhs, rhs) +end + # Loads a value from a pointer. 
@generated function volatile_load(ptr::Ptr{T})::T where T ptr_type = string(convert(LLVMType, Ptr{T})) @@ -54,6 +70,10 @@ end :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) end +function unwrap_device_ptr(ptr::DevicePtr{T, A})::Ptr{T} where {T, A} + convert(Ptr{T}, convert(Csize_t, ptr)) +end + const ReaderWriterLockState = Int64 """ @@ -75,8 +95,8 @@ struct ReaderWriterLock state_ptr::Ptr{ReaderWriterLockState} end -ReaderWriterLock(state_ptr::DevicePtr{ReaderWriterLockState}) = ReaderWriterLock( - convert(Ptr{ReaderWriterLockState}, convert(Csize_t, state_ptr))) +ReaderWriterLock(state_ptr::DevicePtr{ReaderWriterLockState}) = + ReaderWriterLock(unwrap_device_ptr(state_ptr)) const max_rw_lock_readers = (1 << (sizeof(ReaderWriterLockState) * 8 - 1)) @@ -155,3 +175,88 @@ function writer_locked(func::Function, lock::ReaderWriterLock) return result end end + +# Gets the thread ID of the current thread. +@inline function get_thread_id() + return (blockIdx().x - 1) * blockDim().x + threadIdx().x +end + +# Gets the warp ID of the current thread. +@inline function get_warp_id() + return div(get_thread_id() - 1, warpsize()) + 1 +end + +const MutexState = UInt32 + +""" +A mutex: a lock that guarantees mutual exclusion. +""" +struct Mutex + # This GPU mutex implementation is based on + # Lock-based Synchronization for GPU Architectures + # by Yunlong Xu et al. + state_ptr::Ptr{MutexState} +end + +Mutex(state_ptr::DevicePtr{MutexState}) = + Mutex(unwrap_device_ptr(state_ptr)) + +""" + unlock(mutex::Mutex) + +Unlocks a mutex. +""" +function unlock(mutex::Mutex) + threadfence() + tid = get_thread_id() + atomic_compare_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1), UInt32(0)) + return +end + +""" + try_lock(mutex::Mutex)::Bool + +Tries to acquire a lock on a mutex. Returns `true` +if a lock was acquired successfully; otherwise, `false`. 
+""" +function try_lock(mutex::Mutex)::Bool + tid = UInt32(get_thread_id()) + wsize = warpsize() + threadbit = UInt32(1) << (tid % wsize) + + mask = vote_ballot(true) + + bitset = @cuStaticSharedMem(UInt32, 128) + bitset_ptr = unwrap_device_ptr(pointer(bitset)) + sizeof(UInt32) * div(threadIdx().x - 1, wsize) + unsafe_store!(bitset_ptr, UInt32(0)) + + lock = atomic_or!(mutex.state_ptr, UInt32(1)) + if lock & UInt32(1) == UInt32(0) + # The lock is free. + atomic_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1)) + else + pre_owner = lock >> 1 + if pre_owner != tid + if div(lock, wsize << 1) == div(tid, wsize) && pre_owner > tid && (((mask >> (pre_owner % wsize)) & UInt32(1)) == UInt32(1)) + atomic_or!(bitset_ptr, UInt32(1 << (pre_owner % wsize))) + atomic_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1)) + if (atomic_or!(mutex.state_ptr, UInt32(0)) >> 1) != tid + # Stealing failed. + atomic_or!(bitset_ptr, threadbit) + end + else + # Cannot steal. + atomic_or!(bitset_ptr, threadbit) + end + end + end + + if (unsafe_load(bitset_ptr) & threadbit) == UInt32(0) + threadfence() + return true + else + atomic_compare_exchange!(mutex.state_ptr, (tid << 1) + UInt32(1), UInt32(0)) + threadfence() + return false + end +end diff --git a/src/gc.jl b/src/gc.jl index 820bab11..193d6111 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -185,16 +185,6 @@ end return unsafe_load(@cuda_global_ptr("gc_master_record", GCMasterRecord)) end -# Gets the thread ID of the current thread. -@inline function get_thread_id() - return (blockIdx().x - 1) * blockDim().x + threadIdx().x -end - -# Gets the warp ID of the current thread. -@inline function get_warp_id() - return div(get_thread_id() - 1, warpsize()) + 1 -end - # Gets a pointer to the local arena for this thread. This # pointer may be null if there are no local arenas. 
@inline function get_local_arena()::Ptr{GCArenaRecord} From ce75ce85063bc3176b53449fe02b0644e07fe8ac Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 17 Mar 2019 18:23:59 +0100 Subject: [PATCH 059/146] Collect GC statistics --- src/gc.jl | 255 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 150 insertions(+), 105 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 193d6111..b3f01c10 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -45,7 +45,8 @@ export @cuda_gc, gc_malloc, gc_collect, gc_safepoint -import Base: length +import Base: length, show +import Printf: @sprintf # A data structure that precedes every chunk of memory that has been # allocated or put into the free list. @@ -607,7 +608,7 @@ function gc_init!( local_arena_count::Integer = 8)::GCMasterRecord if warp_count == nothing - warp_count = thread_count / CUDAdrv.warpsize(device()) + warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device())) end master_region = heap.regions[1] @@ -648,7 +649,7 @@ function gc_init!( global_arena = make_gc_arena!(arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( - UInt32(warp_count), + warp_count, UInt32(thread_count), root_buffer_capacity, UInt32(local_arena_count), @@ -841,125 +842,165 @@ function gc_compact_free_list(arena::Ptr{GCArenaRecord})::Csize_t return sum(record -> unsafe_load(record).size, records) end +"""A report of the GC's actions.""" +mutable struct GCReport + """The total wall-clock time of a kernel execution.""" + elapsed_time::Float64 + + """The number of collections that were performed.""" + collection_count::Int + + """The total wall-clock time of all collections.""" + collection_time::Float64 + + """The total amount of additional memory allocated to local pools.""" + extra_local_memory::Csize_t + + """The total amount of additional memory allocated to the global pool.""" + extra_global_memory::Csize_t + + GCReport() = new(0.0, 0, 0.0, Csize_t(0), Csize_t(0)) +end + 
+function show(io::IO, report::GCReport) + print(io, "[wall-clock time: $(@sprintf("%.4f", report.elapsed_time)) s; ") + print(io, "collections: $(report.collection_count); ") + collection_percentage = 100 * report.collection_time / report.elapsed_time + print(io, "total collection time: $(@sprintf("%.4f", report.collection_time)) s ($(@sprintf("%.2f", collection_percentage))%); ") + print(io, "extra local memory: $(div(report.extra_local_memory, MiB)) MiB; ") + print(io, "extra global memory: $(div(report.extra_global_memory, MiB)) MiB]") +end + # Collects garbage. This function is designed to be called by the host, # not by the device. -function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription) - # First off, we have to wait for all warps to reach a safepoint. Clear - # safepoint flags and wait for warps to set them again. - for i in 0:(master_record.warp_count - 1) - atomic_compare_exchange!( - master_record.safepoint_flags + i * sizeof(SafepointState), - in_safepoint, - not_in_safepoint) - end - safepoint_count = 0 - while safepoint_count != master_record.warp_count - safepoint_count = 0 +function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, report::GCReport) + collection_time = Base.@elapsed begin + # First off, we have to wait for all warps to reach a safepoint. Clear + # safepoint flags and wait for warps to set them again. 
for i in 0:(master_record.warp_count - 1) - state = volatile_load(master_record.safepoint_flags + i * sizeof(SafepointState)) - if state != not_in_safepoint - safepoint_count += 1 + atomic_compare_exchange!( + master_record.safepoint_flags + i * sizeof(SafepointState), + in_safepoint, + not_in_safepoint) + end + safepoint_count = 0 + while safepoint_count != master_record.warp_count + safepoint_count = 0 + for i in 0:(master_record.warp_count - 1) + state = volatile_load(master_record.safepoint_flags + i * sizeof(SafepointState)) + if state != not_in_safepoint + safepoint_count += 1 + end end end - end - # The Julia CPU GC is precise and the information it uses for precise - # garbage collection is stored in memory that we should be able to access. - # However, the way the CPU GC stores field information is incredibly - # complicated and replicating that logic here would be a royal pain to - # implement and maintain. Ideally, the CPU GC would expose an interface that - # allows us to point to an object and ask the GC for all GC-tracked pointers - # it contains. Alas, no such luck: the CPU GC doesn't even have an internal - # function that does that. The CPU GC's logic for finding GC-tracked pointer - # fields is instead fused tightly with its 'mark' loop. - # - # To cope with this, we will simply implement a semi-conservative GC: we precisely - # scan the roots for pointers into the GC heap. We then recursively mark blocks - # that are pointed to by such pointers as live and conservatively scan them for - # more pointers. - # - # Our mark phase is fairly simple: we maintain a worklist of pointers that - # are live and may need to be processed, as well as a set of blocks that are - # live and have already been processed. - live_blocks = Set{Ptr{GCAllocationRecord}}() - live_worklist = Ptr{ObjectRef}[] - - # Get a sorted allocation list, which will allow us to classify live pointers quickly. 
- alloc_list = sort_allocation_list(master_record) - - # Add all roots to the worklist. - for i in 1:(master_record.root_buffer_capacity * master_record.thread_count) - root = unsafe_load(master_record.root_buffers, i) - if root != C_NULL - push!(live_worklist, root) + # The Julia CPU GC is precise and the information it uses for precise + # garbage collection is stored in memory that we should be able to access. + # However, the way the CPU GC stores field information is incredibly + # complicated and replicating that logic here would be a royal pain to + # implement and maintain. Ideally, the CPU GC would expose an interface that + # allows us to point to an object and ask the GC for all GC-tracked pointers + # it contains. Alas, no such luck: the CPU GC doesn't even have an internal + # function that does that. The CPU GC's logic for finding GC-tracked pointer + # fields is instead fused tightly with its 'mark' loop. + # + # To cope with this, we will simply implement a semi-conservative GC: we precisely + # scan the roots for pointers into the GC heap. We then recursively mark blocks + # that are pointed to by such pointers as live and conservatively scan them for + # more pointers. + # + # Our mark phase is fairly simple: we maintain a worklist of pointers that + # are live and may need to be processed, as well as a set of blocks that are + # live and have already been processed. + live_blocks = Set{Ptr{GCAllocationRecord}}() + live_worklist = Ptr{ObjectRef}[] + + # Get a sorted allocation list, which will allow us to classify live pointers quickly. + alloc_list = sort_allocation_list(master_record) + + # Add all roots to the worklist. + for i in 1:(master_record.root_buffer_capacity * master_record.thread_count) + root = unsafe_load(master_record.root_buffers, i) + if root != C_NULL + push!(live_worklist, root) + end end - end - # Now process all live pointers until we reach a fixpoint. - while !isempty(live_worklist) - # Pop a pointer from the worklist. 
- object_ref = pop!(live_worklist) - # Get the block for that pointer. - record = get_record(alloc_list, object_ref) - # Make sure that we haven't visited the block yet. - if record != C_NULL && !(record in live_blocks) - # Mark the block as live. - push!(live_blocks, record) - # Add all pointer-sized, aligned values to the live pointer worklist. - block_pointer = data_pointer(record) - block_size = unsafe_load(record).size - for i in 0:sizeof(ObjectRef):(block_size - 1) - push!(live_worklist, Base.unsafe_convert(ObjectRef, block_pointer + i)) + # Now process all live pointers until we reach a fixpoint. + while !isempty(live_worklist) + # Pop a pointer from the worklist. + object_ref = pop!(live_worklist) + # Get the block for that pointer. + record = get_record(alloc_list, object_ref) + # Make sure that we haven't visited the block yet. + if record != C_NULL && !(record in live_blocks) + # Mark the block as live. + push!(live_blocks, record) + # Add all pointer-sized, aligned values to the live pointer worklist. + block_pointer = data_pointer(record) + block_size = unsafe_load(record).size + for i in 0:sizeof(ObjectRef):(block_size - 1) + push!(live_worklist, Base.unsafe_convert(ObjectRef, block_pointer + i)) + end end end - end - # We're done with the mark phase! Time to proceed to the sweep phase. - # The first thing we'll do is iterate through every arena's allocation list and - # free dead blocks. Next, we will compact and reorder free lists to combat - # fragmentation. - iterate_arenas(master_record) do arena - record_ptr = @get_field_pointer(arena, :allocation_list_head) - while true - record = unsafe_load(record_ptr) - if record == C_NULL - # We've reached the end of the list. - break + # We're done with the mark phase! Time to proceed to the sweep phase. + # The first thing we'll do is iterate through every arena's allocation list and + # free dead blocks. Next, we will compact and reorder free lists to combat + # fragmentation. 
+ iterate_arenas(master_record) do arena + record_ptr = @get_field_pointer(arena, :allocation_list_head) + while true + record = unsafe_load(record_ptr) + if record == C_NULL + # We've reached the end of the list. + break + end + + if record in live_blocks + # We found a live block. Proceed to the next block. + record_ptr = @get_field_pointer(record, :next) + else + # We found a dead block. Release it. Don't proceed to the + # next block because the current block will change in the + # next iteration of this loop. + gc_free_local(arena, record_ptr) + end end - if record in live_blocks - # We found a live block. Proceed to the next block. - record_ptr = @get_field_pointer(record, :next) + # Compact the free list. + free_memory = gc_compact_free_list(arena) + + # If the amount of free memory in the arena is below the starvation + # limit then we'll expand the GC heap and add the additional memory + # to the arena's free list. + threshold = if arena == master_record.global_arena + global_arena_starvation_threshold else - # We found a dead block. Release it. Don't proceed to the - # next block because the current block will change in the - # next iteration of this loop. - gc_free_local(arena, record_ptr) + local_arena_starvation_threshold end - end - # Compact the free list. - free_memory = gc_compact_free_list(arena) + if free_memory < threshold + region = expand!(heap, threshold) - # If the amount of free memory in the arena is below the starvation - # limit then we'll expand the GC heap and add the additional memory - # to the arena's free list. 
- threshold = if arena == master_record.global_arena - global_arena_starvation_threshold - else - local_arena_starvation_threshold - end + if arena == master_record.global_arena + report.extra_global_memory += Csize_t(threshold) + else + report.extra_local_memory += Csize_t(threshold) + end - if free_memory < threshold - region = expand!(heap, threshold) - extra_record = make_gc_block!(region.start, region.size) - last_free_list_ptr = @get_field_pointer(arena, :free_list_head) - iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record - last_free_list_ptr = @get_field_pointer(record, :next) + extra_record = make_gc_block!(region.start, region.size) + last_free_list_ptr = @get_field_pointer(arena, :free_list_head) + iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record + last_free_list_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(last_free_list_ptr, extra_record) end - unsafe_store!(last_free_list_ptr, extra_record) end end + report.collection_count += 1 + report.collection_time += collection_time end # Examines a keyword argument list and gets either the value @@ -1056,8 +1097,9 @@ macro cuda_gc(ex...) end end + local gc_report = GCReport() local function handle_interrupt() - gc_collect_impl(master_record, gc_heap) + gc_collect_impl(master_record, gc_heap, gc_report) end try @@ -1066,14 +1108,17 @@ macro cuda_gc(ex...) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, $(map(esc, compiler_kwargs)...)) CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) - kernel(kernel_args...; $(map(esc, call_kwargs)...)) + gc_report.elapsed_time = Base.@elapsed begin + kernel(kernel_args...; $(map(esc, call_kwargs)...)) - # Handle interrupts. - handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + # Handle interrupts. 
+ handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + end finally free_shared_array(host_interrupt_array) free!(gc_heap) end + gc_report end end) return code From 8745e96ee4ebd2e0c96a052e44a35757da1218cb Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 17 Mar 2019 18:24:52 +0100 Subject: [PATCH 060/146] Add a matrix example --- examples/matrix.jl | 134 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 examples/matrix.jl diff --git a/examples/matrix.jl b/examples/matrix.jl new file mode 100644 index 00000000..a787171f --- /dev/null +++ b/examples/matrix.jl @@ -0,0 +1,134 @@ +# This example has kernels allocate dense symmetric matrices, fill them with Fibonacci numbers +# and compute their squares. The example is designed to stress the garbage allocator, specifically +# testing its ability to deal with many large objects. Furthermore, the example requires multiple +# collections to run to completion, so it also tests the performance of those collections. + +using StaticArrays, CUDAnative, CUDAdrv +import Base: getindex, setindex!, pointer, unsafe_convert, zeros +using InteractiveUtils + +const use_gc = true + +"""A fixed-size, heap-allocated array type for CUDAnative kernels.""" +struct FixedArray{T} + # The number of elements in the array. + size::Int + + # A pointer to the first element in the array. + # + # TODO: maybe protect this pointer from the GC somehow? + # At the moment, this pointer is protected automatically + # because the GC is conservative rather than precise. + ptr::Ptr{T} +end + +"""Allocates a heap-allocated array type and fills it with zeros.""" +function zeros(::Type{FixedArray{T}}, size::Int) where T + # Note: GC memory is always zero-initialized, so we don't + # actually have to fill the array with zeros. + bytesize = Csize_t(sizeof(T) * size) + buf = use_gc ? 
gc_malloc(bytesize) : CUDAnative.malloc(bytesize) + FixedArray{T}(size, unsafe_convert(Ptr{T}, buf)) +end + +"""Gets a pointer to the first element of a fixed-size array.""" +function pointer(array::FixedArray{T})::Ptr{T} where T + array.ptr +end + +function getindex(array::FixedArray{T}, i::Integer)::T where T + # TODO: bounds checking. + unsafe_load(pointer(array), i) +end + +function setindex!(array::FixedArray{T}, value::T, i::Integer) where T + # TODO: bounds checking. + unsafe_store!(pointer(array), value, i) +end + +"""A heap-allocated matrix type, suitable for CUDAnative kernels.""" +struct Matrix{Width, Height, T} + data::FixedArray{T} +end + +Matrix{Width, Height, T}() where {Width, Height, T} = + Matrix{Width, Height, T}(zeros(FixedArray{T}, Width * Height)) + +function pointer(matrix::Matrix{Width, Height, T})::Ptr{T} where {Width, Height, T} + pointer(matrix.data) +end + +function getindex(matrix::Matrix{Width, Height, T}, row::Int, column::Int) where {Width, Height, T} + getindex(matrix.data, (row - 1) * Width + column) +end + +function setindex!(matrix::Matrix{Width, Height, T}, value::T, row::Int, column::Int) where {Width, Height, T} + setindex!(matrix.data, value, (row - 1) * Width + column) +end + +const matrix_dim = 50 +const iterations = 20 +const thread_count = 256 + +function kernel(result::CUDAnative.DevicePtr{Int64}) + thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + accumulator = 0 + + for _ in 1:iterations + # Allocate a matrix. + matrix = Matrix{matrix_dim, matrix_dim, Int64}() + + # Fill it with Fibonacci numbers. + penultimate = 0 + ultimate = 1 + for i in 1:matrix_dim + for j in 1:matrix_dim + matrix[i, j] = ultimate + tmp = ultimate + ultimate = ultimate + penultimate + penultimate = tmp + end + end + + # Create a new element that contains the square of + # every element in `matrix`. 
+ square = Matrix{matrix_dim, matrix_dim, Int64}() + for i in 1:matrix_dim + for j in 1:matrix_dim + square[i, j] = matrix[i, j] ^ 2 + end + end + + # Compute the sum of the squares. + square_sum = 0 + for i in 1:matrix_dim + for j in 1:matrix_dim + square_sum += square[i, j] + end + end + + # Add that sum to an accumulator. + accumulator += square_sum + end + + # Write the accumulator to the result array. + unsafe_store!(result, accumulator, thread_id) + + return +end + +destination_array = Mem.alloc(Int64, thread_count) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +if use_gc + time = @cuda_gc threads=thread_count kernel(destination_pointer) + println(time) + time = @cuda_gc threads=thread_count kernel(destination_pointer) + println(time) +else + time = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) + println(time) + time = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) + println(time) +end From cc11f577b527bd0e7e032776eac3decc6d5487e3 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 17 Mar 2019 18:28:29 +0100 Subject: [PATCH 061/146] Amend binary tree example with a no-gc mode --- examples/binary-tree.jl | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/examples/binary-tree.jl b/examples/binary-tree.jl index 5fb0c19a..46db7d38 100644 --- a/examples/binary-tree.jl +++ b/examples/binary-tree.jl @@ -9,6 +9,8 @@ import Base: haskey, insert! # The main point of this example is to demonstrate that even # naive, pointer-chasing programs can be compiled to GPU kernels. 
+const use_gc = true + """A binary search tree node.""" abstract type BinarySearchTreeNode{T} end @@ -115,8 +117,8 @@ function fibonacci(::Type{T}, count::Integer)::Array{T} where T return results end -const number_count = 2000 -const thread_count = 32 +const number_count = 200 +const thread_count = 64 const tests_per_thread = 2000 # Define a kernel that copies values using a temporary buffer. @@ -152,7 +154,21 @@ destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) Mem.upload!(source_array, number_set) Mem.upload!(destination_array, test_sequence) -# Run the kernel. -@cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) +if use_gc + # Run the kernel. + @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + + # Run it again. + Mem.upload!(destination_array, test_sequence) + stats = @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) +else + # Run the kernel. + @cuda threads=thread_count kernel(source_pointer, destination_pointer) + + # Run it again and time it this time. 
+ Mem.upload!(destination_array, test_sequence) + stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) +end +println(stats) @test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) From 2f340884c5c965b1c8006452eda4baff5db6e79f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 18 Mar 2019 12:38:42 +0100 Subject: [PATCH 062/146] Measure GC polling times --- src/device/runtime.jl | 13 ++----------- src/gc.jl | 25 ++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 27589633..6b7c792e 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -232,17 +232,8 @@ function T_pprjlvalue() LLVM.PointerType(eltype(T_pjlvalue), Tracked)) end -""" - gc_malloc_object(bytesize::Csize_t) - -Allocates an object that is managed by the garbage collector. -This function is designed to be called by the device. -""" -function gc_malloc_object(bytesize::Csize_t) - return unsafe_pointer_to_objref(gc_malloc(bytesize)) -end - -compile(gc_malloc_object, Any, (Csize_t,), T_prjlvalue) +# Include the GC memory allocation function into the runtime. +compile(CUDAnative.gc_malloc_object, Any, (Csize_t,), T_prjlvalue) # Include GC frame management functions into the runtime. compile(CUDAnative.new_gc_frame, Any, (Cuint,), T_pprjlvalue) diff --git a/src/gc.jl b/src/gc.jl index b3f01c10..fb2008f6 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -43,7 +43,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. 
-export @cuda_gc, gc_malloc, gc_collect, gc_safepoint +export @cuda_gc, gc_malloc, gc_malloc_object, gc_collect, gc_safepoint import Base: length, show import Printf: @sprintf @@ -497,6 +497,16 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} return C_NULL end +""" + gc_malloc_object(bytesize::Csize_t) + +Allocates an object that is managed by the garbage collector. +This function is designed to be called by the device. +""" +function gc_malloc_object(bytesize::Csize_t) + unsafe_pointer_to_objref(gc_malloc(bytesize)) +end + # Zero-fills a range of memory. function zero_fill!(start_ptr::Ptr{UInt8}, size::Csize_t) ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), start_ptr, 0, size) @@ -850,6 +860,9 @@ mutable struct GCReport """The number of collections that were performed.""" collection_count::Int + """The total wall-clock time of all collection polls.""" + collection_poll_time::Float64 + """The total wall-clock time of all collections.""" collection_time::Float64 @@ -859,12 +872,14 @@ mutable struct GCReport """The total amount of additional memory allocated to the global pool.""" extra_global_memory::Csize_t - GCReport() = new(0.0, 0, 0.0, Csize_t(0), Csize_t(0)) + GCReport() = new(0.0, 0, 0.0, 0.0, Csize_t(0), Csize_t(0)) end function show(io::IO, report::GCReport) print(io, "[wall-clock time: $(@sprintf("%.4f", report.elapsed_time)) s; ") print(io, "collections: $(report.collection_count); ") + poll_percentage = 100 * report.collection_poll_time / report.elapsed_time + print(io, "total poll time: $(@sprintf("%.4f", report.collection_poll_time)) s ($(@sprintf("%.2f", poll_percentage))%); ") collection_percentage = 100 * report.collection_time / report.elapsed_time print(io, "total collection time: $(@sprintf("%.4f", report.collection_time)) s ($(@sprintf("%.2f", collection_percentage))%); ") print(io, "extra local memory: $(div(report.extra_local_memory, MiB)) MiB; ") @@ -874,7 +889,7 @@ end # Collects garbage. 
This function is designed to be called by the host, # not by the device. function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, report::GCReport) - collection_time = Base.@elapsed begin + poll_time = Base.@elapsed begin # First off, we have to wait for all warps to reach a safepoint. Clear # safepoint flags and wait for warps to set them again. for i in 0:(master_record.warp_count - 1) @@ -893,6 +908,9 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, end end end + end + + collection_time = Base.@elapsed begin # The Julia CPU GC is precise and the information it uses for precise # garbage collection is stored in memory that we should be able to access. @@ -1001,6 +1019,7 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, end report.collection_count += 1 report.collection_time += collection_time + report.collection_poll_time += poll_time end # Examines a keyword argument list and gets either the value From cec2dcc54340175af0893ca4d3b1646bab8b0a67 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 18 Mar 2019 13:15:12 +0100 Subject: [PATCH 063/146] Rename GC free list data structures --- src/gc.jl | 84 +++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index fb2008f6..2893cd7e 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -50,7 +50,7 @@ import Printf: @sprintf # A data structure that precedes every chunk of memory that has been # allocated or put into the free list. -struct GCAllocationRecord +struct FreeListRecord # The size of the memory region this allocation record precedes. # This size does not include the allocation record itself. size::Csize_t @@ -59,7 +59,7 @@ struct GCAllocationRecord # allocation record is part of the free list, then this pointer # points to the next free list entry; otherwise, it points to the # next entry in the list of allocated blocks. 
- next::Ptr{GCAllocationRecord} + next::Ptr{FreeListRecord} end @generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name} @@ -75,27 +75,27 @@ macro get_field_pointer(base_pointer, field_name) end # Gets a pointer to the first byte of data managed by an allocation record. -function data_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} - Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(GCAllocationRecord) +function data_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8} + Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(FreeListRecord) end # Gets a pointer to the first byte of data no longer managed by an allocation record. -function data_end_pointer(record::Ptr{GCAllocationRecord})::Ptr{UInt8} +function data_end_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8} data_pointer(record) + unsafe_load(@get_field_pointer(record, :size)) end # A data structure that describes a single GC "arena", i.e., # a section of the heap that is managed by the GC. Every arena # has its own free list and allocation list. -struct GCArenaRecord +struct FreeListArena # The allocation lock for the arena. lock_state::ReaderWriterLockState # The head of the free list. - free_list_head::Ptr{GCAllocationRecord} + free_list_head::Ptr{FreeListRecord} # The head of the allocation list. - allocation_list_head::Ptr{GCAllocationRecord} + allocation_list_head::Ptr{FreeListRecord} end # A reference to a Julia object. @@ -136,10 +136,10 @@ struct GCMasterRecord local_arena_count::UInt32 # A pointer to a list of local GC arena pointers. - local_arenas::Ptr{Ptr{GCArenaRecord}} + local_arenas::Ptr{Ptr{FreeListArena}} # A pointer to the global GC arena. - global_arena::Ptr{GCArenaRecord} + global_arena::Ptr{FreeListArena} # A pointer to a list of safepoint flags. Every warp has its # own flag. @@ -188,7 +188,7 @@ end # Gets a pointer to the local arena for this thread. This # pointer may be null if there are no local arenas. 
-@inline function get_local_arena()::Ptr{GCArenaRecord} +@inline function get_local_arena()::Ptr{FreeListArena} master_record = get_gc_master_record() if master_record.local_arena_count == UInt32(0) return C_NULL @@ -309,9 +309,9 @@ end # Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. # Updates the free list if the allocation succeeds. Returns a null pointer otherwise. function gc_use_free_list_entry( - entry_ptr::Ptr{Ptr{GCAllocationRecord}}, - allocation_list_ptr::Ptr{Ptr{GCAllocationRecord}}, - entry::Ptr{GCAllocationRecord}, + entry_ptr::Ptr{Ptr{FreeListRecord}}, + allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, + entry::Ptr{FreeListRecord}, bytesize::Csize_t,)::Ptr{UInt8} entry_data = unsafe_load(entry) @@ -333,7 +333,7 @@ function gc_use_free_list_entry( # prefixed by the block needs to be aligned to a 16-byte boundary, # but the block itself doesn't. new_data_address = align_to_boundary(data_address + bytesize) - new_entry_address = new_data_address - sizeof(GCAllocationRecord) + new_entry_address = new_data_address - sizeof(FreeListRecord) if new_entry_address < data_address + bytesize new_entry_address += gc_align new_data_address += gc_align @@ -344,10 +344,10 @@ function gc_use_free_list_entry( if new_data_address < end_address # Create a new free list entry. new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) - new_entry_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, new_entry_address) + new_entry_ptr = Base.unsafe_convert(Ptr{FreeListRecord}, new_entry_address) unsafe_store!( new_entry_ptr, - GCAllocationRecord(new_entry_size, entry_data.next)) + FreeListRecord(new_entry_size, entry_data.next)) # Update this entry's `size` field to reflect the new entry's space # requirements. @@ -369,7 +369,7 @@ function gc_use_free_list_entry( # Set the `next` pointer to the value stored at the allocation list pointer. 
unsafe_store!( - @get_field_pointer(entry, :next)::Ptr{Ptr{GCAllocationRecord}}, + @get_field_pointer(entry, :next)::Ptr{Ptr{FreeListRecord}}, unsafe_load(allocation_list_ptr)) # Update the allocation list pointer to point to the entry. @@ -387,8 +387,8 @@ end # # This function is not thread-safe. function gc_malloc_from_free_list( - free_list_ptr::Ptr{Ptr{GCAllocationRecord}}, - allocation_list_ptr::Ptr{Ptr{GCAllocationRecord}}, + free_list_ptr::Ptr{Ptr{FreeListRecord}}, + allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, bytesize::Csize_t)::Ptr{UInt8} # To allocate memory, we will walk the free list until we find a suitable candidate. while free_list_ptr != C_NULL @@ -403,7 +403,7 @@ function gc_malloc_from_free_list( return result end - free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{GCAllocationRecord}} + free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{FreeListRecord}} end return C_NULL end @@ -411,13 +411,13 @@ end # Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -function gc_malloc_local(arena::Ptr{GCArenaRecord}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} # Acquire the arena's lock. arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) result_ptr = writer_locked(arena_lock) do # Allocate a suitable region of memory. 
- free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{GCAllocationRecord}} - allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{GCAllocationRecord}} + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) end @@ -526,8 +526,8 @@ end # case it should be prefixed by the `@nocollect` macro followed by # a write lock acquisition on the arena's lock. function gc_free_local( - arena::Ptr{GCArenaRecord}, - record_ptr::Ptr{Ptr{GCAllocationRecord}}) + arena::Ptr{FreeListArena}, + record_ptr::Ptr{Ptr{FreeListRecord}}) record = unsafe_load(record_ptr) next_record_ptr = @get_field_pointer(record, :next) @@ -627,8 +627,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Allocate a local arena pointer buffer. - local_arenas_bytesize = sizeof(Ptr{GCArenaRecord}) * local_arena_count - local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{GCArenaRecord}}, gc_memory_start_ptr) + local_arenas_bytesize = sizeof(Ptr{FreeListArena}) * local_arena_count + local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{FreeListArena}}, gc_memory_start_ptr) # Allocate the safepoint flag buffer. safepoint_bytesize = sizeof(SafepointState) * warp_count @@ -672,11 +672,11 @@ end # Takes a zero-filled region of memory and turns it into a block # managed by the GC, prefixed with an allocation record. 
-function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCAllocationRecord} where T - entry = Base.unsafe_convert(Ptr{GCAllocationRecord}, start_ptr) +function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListRecord} where T + entry = Base.unsafe_convert(Ptr{FreeListRecord}, start_ptr) unsafe_store!( entry, - GCAllocationRecord( + FreeListRecord( Csize_t(start_ptr + size) - Csize_t(data_pointer(entry)), C_NULL)) return entry @@ -684,15 +684,15 @@ end # Takes a zero-filled region of memory and turns it into an arena # managed by the GC, prefixed with an arena record. -function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{GCArenaRecord} where T +function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} where T # Create a single free list entry. - first_entry_ptr = make_gc_block!(start_ptr + sizeof(GCArenaRecord), size - sizeof(GCArenaRecord)) + first_entry_ptr = make_gc_block!(start_ptr + sizeof(FreeListArena), size - sizeof(FreeListArena)) # Set up the arena record. - arena = Base.unsafe_convert(Ptr{GCArenaRecord}, start_ptr) + arena = Base.unsafe_convert(Ptr{FreeListArena}, start_ptr) unsafe_store!( arena, - GCArenaRecord(0, first_entry_ptr, C_NULL)) + FreeListArena(0, first_entry_ptr, C_NULL)) end # Tells if a GC heap contains a particular pointer. @@ -728,7 +728,7 @@ end struct SortedAllocationList # An array of pointers to allocation records. The pointers # are all sorted. - records::Array{Ptr{GCAllocationRecord}, 1} + records::Array{Ptr{FreeListRecord}, 1} end length(alloc_list::SortedAllocationList) = length(alloc_list.records) @@ -738,9 +738,9 @@ length(alloc_list::SortedAllocationList) = length(alloc_list.records) # such record. 
function get_record( alloc_list::SortedAllocationList, - pointer::Ptr{T})::Ptr{GCAllocationRecord} where T + pointer::Ptr{T})::Ptr{FreeListRecord} where T - cast_ptr = Base.unsafe_convert(Ptr{GCAllocationRecord}, pointer) + cast_ptr = Base.unsafe_convert(Ptr{FreeListRecord}, pointer) # Deal with the most common cases quickly. if length(alloc_list) == 0 || @@ -781,7 +781,7 @@ end # Iterates through a linked list of allocation records and apply a function # to every node in the linked list. The function is allowed to modify allocation # records. -@inline function iterate_allocation_records(fun::Function, head::Ptr{GCAllocationRecord}) +@inline function iterate_allocation_records(fun::Function, head::Ptr{FreeListRecord}) while head != C_NULL fun(head) head = unsafe_load(head).next @@ -807,9 +807,9 @@ end # 2. reorder free blocks to put small blocks at the front # of the free list, # 3. tally the total number of free bytes and return that number. -function gc_compact_free_list(arena::Ptr{GCArenaRecord})::Csize_t +function gc_compact_free_list(arena::Ptr{FreeListArena})::Csize_t # Let's start by creating a list of all free list records. - records = Ptr{GCAllocationRecord}[] + records = Ptr{FreeListRecord}[] free_list_head = unsafe_load(arena).free_list_head iterate_allocation_records(free_list_head) do record push!(records, record) @@ -930,7 +930,7 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, # Our mark phase is fairly simple: we maintain a worklist of pointers that # are live and may need to be processed, as well as a set of blocks that are # live and have already been processed. - live_blocks = Set{Ptr{GCAllocationRecord}}() + live_blocks = Set{Ptr{FreeListRecord}}() live_worklist = Ptr{ObjectRef}[] # Get a sorted allocation list, which will allow us to classify live pointers quickly. 
From f4cdf0b934d324faeb04a06ae9d242d1bea09980 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 18 Mar 2019 18:48:54 +0100 Subject: [PATCH 064/146] Implement a ScatterAlloc-based allocator --- src/compiler/optim.jl | 7 +- src/device/threading.jl | 6 + src/gc.jl | 411 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 395 insertions(+), 29 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index cdd127de..5ccf964a 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -651,14 +651,19 @@ function insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) # API doesn't expose. if has_gc_frame(fun) + safepoint_function = Runtime.get(:gc_safepoint) let builder = Builder(JuliaContext()) for block in blocks(fun) for instruction in instructions(block) if is_non_intrinsic_call(instruction) + if called_value(instruction) == safepoint_function + continue + end + # Insert a safepoint just before the call. position!(builder, instruction) debuglocation!(builder, instruction) - call!(builder, Runtime.get(:gc_safepoint), LLVM.Value[]) + call!(builder, safepoint_function, LLVM.Value[]) end end end diff --git a/src/device/threading.jl b/src/device/threading.jl index 951c20e8..a7de7645 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -34,6 +34,12 @@ function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T atomic_rmw!(Val(:add), lhs, rhs) end +# Atomically subtracts a value from a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_subtract!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:sub), lhs, rhs) +end + # Atomically computes the logical or of a value and a variable pointed # to by a pointer. Returns the previous value stored in that variable. 
function atomic_or!(lhs::Ptr{T}, rhs::T)::T where T diff --git a/src/gc.jl b/src/gc.jl index 2893cd7e..5830c7f3 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -98,6 +98,125 @@ struct FreeListArena allocation_list_head::Ptr{FreeListRecord} end +# A data structure that describes a ScatterAlloc superblock. Every +# superblock is prefixed by one of these. +struct ScatterAllocSuperblock + # The number of regions in the superblock. + region_count::UInt32 + + # The number of pages in a region managed by this superblock. + pages_per_region::UInt32 + + # The size of a page in the superblock, in bytes. This size + # does not include the page's header. + page_size::UInt32 + + # A pointer to the next superblock. + next::Ptr{ScatterAllocSuperblock} +end + +# A region in a ScatterAlloc superblock. +struct ScatterAllocRegion + # The number of pages in this region that are full. + full_page_count::Int64 +end + +# A page in a ScatterAlloc region. +struct ScatterAllocPage + # The size of a chunk in this page. + chunk_size::Int64 + + # The number of allocated blocks in this page. + allocated_chunk_count::Int64 + + # A bitmask that describes which chunks have been allocated + # and which chunks are still free. + occupancy::Int64 +end + +const gc_align = Csize_t(16) + +# Aligns a pointer to an alignment boundary. +function align_downward(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + address_int = Base.convert(Csize_t, address) + remainder = address_int % alignment + if remainder == Csize_t(0) + return address + else + return address + alignment - remainder + end +end + +# Aligns a pointer to an alignment boundary. +function align_upward(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + result = align_downward(address, alignment) + if result < address + result += alignment + end + result +end + +# Aligns a pointer to an alignment boundary. 
+function align_upward(offset::T, alignment::Csize_t = gc_align)::T where T <: Integer + convert(T, Csize_t(align_upward(convert(Ptr{UInt8}, Csize_t(offset)), alignment))) +end + +# Gets the page size in a superblock. This size does not include +# the page header. +function page_size(superblock::Ptr{ScatterAllocSuperblock}) + unsafe_load(@get_field_pointer(superblock, :page_size)) +end + +# Gets the number of pages per region in a superblock. +function pages_per_region(superblock::Ptr{ScatterAllocSuperblock}) + unsafe_load(@get_field_pointer(superblock, :pages_per_region)) +end + +# Gets the size of an aligned header, including padding to satisfy +# alignment requirements. +@generated function header_size(::Type{T}, ::Val{alignment} = Val(gc_align))::UInt32 where {T, alignment} + result = align_upward(UInt32(sizeof(T)), alignment) + :($result) +end + +# Gets the total number of chunks in a particular page. +function chunk_count(page::Ptr{ScatterAllocPage}, superblock::Ptr{ScatterAllocSuperblock}) + chunk_size = unsafe_load(@get_field_pointer(page, :chunk_size)) + div(page_size(superblock), chunk_size) +end + +# Gets the address of a particular chunk in a page. `index` is zero-based. +function chunk_address(page::Ptr{ScatterAllocPage}, index::Integer)::Ptr{UInt8} + chunk_size = unsafe_load(@get_field_pointer(page, :chunk_size)) + Base.unsafe_convert(Ptr{UInt8}, page + header_size(ScatterAllocPage) + chunk_size * index) +end + +# Gets the address of a particular page in a region. `index` is zero-based. +function page_address(region::Ptr{ScatterAllocRegion}, superblock::Ptr{ScatterAllocSuperblock}, index::Integer)::Ptr{ScatterAllocPage} + Base.unsafe_convert( + Ptr{ScatterAllocPage}, + region + header_size(ScatterAllocRegion) + index * (header_size(ScatterAllocPage) + page_size(superblock))) +end + +# Gets the total size in bytes of a region, including overhead. 
+function region_bytesize(pages_per_region::Integer, page_size::Integer) + region_data_size = pages_per_region * (header_size(ScatterAllocPage) + page_size) + header_size(ScatterAllocRegion) + region_data_size +end + +# Gets the address of a particular region in a superblock. `index` is zero-based. +function region_address(superblock::Ptr{ScatterAllocSuperblock}, index::Integer)::Ptr{ScatterAllocRegion} + Base.unsafe_convert( + Ptr{ScatterAllocPage}, + superblock + header_size(ScatterAllocSuperblock) + index * region_bytesize(pages_per_region(superblock), page_size(superblock))) +end + +# A GC arena that uses the ScatterAlloc algorithm for allocations. +struct ScatterAllocArena + # A pointer to the first superblock managed by this arena. + first_superblock::Ptr{ScatterAllocSuperblock} +end + # A reference to a Julia object. const ObjectRef = Ptr{Nothing} @@ -135,6 +254,10 @@ struct GCMasterRecord # The number of local arenas. local_arena_count::UInt32 + # A pointer to the tiny arena, which uses the ScatterAlloc + # algorithm to provision space for small objects. + tiny_arena::Ptr{ScatterAllocArena} + # A pointer to a list of local GC arena pointers. local_arenas::Ptr{Ptr{FreeListArena}} @@ -293,19 +416,6 @@ macro perma_safepoint(expr) end end -const gc_align = Csize_t(16) - -# Aligns a pointer to an alignment boundary. -function align_to_boundary(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T - address_int = Base.convert(Csize_t, address) - remainder = address_int % alignment - if remainder == Csize_t(0) - return address - else - return address + alignment - remainder - end -end - # Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. # Updates the free list if the allocation succeeds. Returns a null pointer otherwise. function gc_use_free_list_entry( @@ -332,7 +442,7 @@ function gc_use_free_list_entry( # Compute the start address of the new free list entry. 
The data # prefixed by the block needs to be aligned to a 16-byte boundary, # but the block itself doesn't. - new_data_address = align_to_boundary(data_address + bytesize) + new_data_address = align_downward(data_address + bytesize) new_entry_address = new_data_address - sizeof(FreeListRecord) if new_entry_address < data_address + bytesize new_entry_address += gc_align @@ -378,6 +488,181 @@ function gc_use_free_list_entry( return data_address end +# Tries to allocate a chunk of memory from a ScatterAlloc page. +# Returns a null pointer if no chunk of memory can be found. +function gc_scatter_alloc_use_page( + page::Ptr{ScatterAllocPage}, + region::Ptr{ScatterAllocRegion}, + superblock::Ptr{ScatterAllocSuperblock})::Ptr{UInt8} + + alloc_chunk_ptr = @get_field_pointer(page, :allocated_chunk_count) + fill_level = atomic_add!(alloc_chunk_ptr, 1) + spots = chunk_count(page, superblock) + if fill_level < spots + if fill_level + 1 == spots + # The page is full now. Increment the region's counter. + full_page_ptr = @get_field_pointer(region, :full_page_count) + atomic_add!(full_page_ptr, 1) + end + + lane_id = (get_thread_id() - 1) % warpsize() + spot = lane_id % spots + occupancy_ptr = @get_field_pointer(page, :occupancy) + while true + # Check if our preferred spot is available. + mask = 1 << spot + old = atomic_or!(occupancy_ptr, mask) + + actual_fill = 0 + for i in 1:64 + if old & (1 << (i - 1)) != 0 + actual_fill += 1 + end + end + + # If the spot is available, then use it. + if old & mask == 0 + break + end + + # Otherwise, find a new spot. + spot = (spot + 1) % spots + end + return chunk_address(page, spot) + end + + # The page is full. 
+ atomic_subtract!(alloc_chunk_ptr, 1) + return C_NULL +end + +function scatter_alloc_hash( + superblock::Ptr{ScatterAllocSuperblock}, + bytesize::Int64)::Int64 + + sb = unsafe_load(superblock) + page_count = sb.region_count * sb.pages_per_region + warp_id = get_warp_id() - 1 + + k_S = 38183 + k_mp = 17497 + + (bytesize * k_S + warp_id * k_mp) % page_count +end + +# Tries to allocate a chunk of memory from a ScatterAlloc superblock. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_scatter_alloc_use_superblock( + superblock::Ptr{ScatterAllocSuperblock}, + bytesize::Csize_t)::Ptr{UInt8} + + if bytesize > page_size(superblock) + # This isn't going to work. The superblock's page size is just too small. + return C_NULL + end + + # Choose the allocation size in such a way that we never end up with more than + # 64 chunks. This is necessary because the chunk occupancy bitfield is only + # 64 bits wide. + alloc_size = Int64(div(page_size(superblock), 64)) + if alloc_size < Int64(bytesize) + alloc_size = Int64(bytesize) + end + + # Align the allocation size. + alloc_size = align_upward(alloc_size) + + # We are looking for a chunk that is `bytesize` bytes in size, + # but we're willing to accept a chunk that is twice as large. + waste_factor = 2 + max_size = alloc_size * waste_factor + + pages_per_region = unsafe_load(@get_field_pointer(superblock, :pages_per_region)) + region_count = unsafe_load(@get_field_pointer(superblock, :region_count)) + + # Guess a global page index. + global_page_id = scatter_alloc_hash(superblock, alloc_size) + + # Decompose that global page index into a region index and a + # local page index. + region_id = global_page_id % pages_per_region + page_id = div(global_page_id, pages_per_region) + + # Remember the initial values of the region and page ids. + init_region_id = region_id + init_page_id = page_id + + # Find the region and page corresponding to the current page ID. 
+ region = region_address(superblock, region_id) + while true + page = page_address(region, superblock, page_id) + + # Skip regions until we find a region that is sufficiently empty. + while true + region_fill_level = unsafe_load(region).full_page_count / pages_per_region + if region_fill_level > 0.9 + region_id += 1 + if region_id >= region_count + region_id = 0 + end + region = region_address(superblock, region_id) + page_id = 0 + else + break + end + end + + # Try to set the chunk size to our preferred chunk size. + chunk_size_ptr = @get_field_pointer(page, :chunk_size) + chunk_size = atomic_compare_exchange!(chunk_size_ptr, 0, alloc_size) + if chunk_size == 0 || (chunk_size >= alloc_size && chunk_size <= max_size) + # If we managed to set the page's chunk size, then the page is definitely + # suitable for our purposes. Otherwise, the page might still be suitable + # if its chunk size is sufficiently large to accommodate the requested + # size yet small enough to not waste too much space. + result = gc_scatter_alloc_use_page(page, region, superblock) + if result != C_NULL + return result + end + end + + # Try the next page. + page_id += 1 + + if page_id >= pages_per_region + region_id += 1 + if region_id >= region_count + region_id = 0 + end + region = region_address(superblock, region_id) + page_id = 0 + end + + # We tried every page in the entire superblock and found nothing. + if region_id == init_region_id && page_id == init_page_id + return C_NULL + end + end +end + +# Tries to allocate a chunk of memory in a particular GC arena. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_malloc_local(arena::Ptr{ScatterAllocArena}, bytesize::Csize_t)::Ptr{UInt8} + # Walk the list of superblocks until we find a valid candidate. 
+ superblock = unsafe_load(arena).first_superblock + while superblock != C_NULL + result = gc_scatter_alloc_use_superblock(superblock, bytesize) + if result != C_NULL + return result + end + superblock = unsafe_load(@get_field_pointer(superblock, :next)) + end + + return C_NULL +end + # Tries to allocate a chunk of memory from a free list. # Returns a null pointer if no sufficiently large chunk of # memory can be found. @@ -441,24 +726,35 @@ This function is designed to be called by the device. function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() - # Try to malloc the object without host intervention. - ptr = @perma_safepoint @nocollect begin - # Try to allocate in the local arena first. If that doesn't + function allocate() + # Try to allocate in the tiny arena first. The ScatterAlloc + # algorithm used by that arena is lock-free and works well + # for small objects. + if master_record.tiny_arena != C_NULL + local_ptr = gc_malloc_local(master_record.tiny_arena, bytesize) + if local_ptr != C_NULL + return local_ptr + end + end + + # Try to allocate in the local arena second. If that doesn't # work, we'll move on to the global arena, which is bigger but # is shared by all threads. (We want to minimize contention # on the global arena's lock.) local_arena = get_local_arena() - local_ptr = Base.unsafe_convert(Ptr{UInt8}, C_NULL) if local_arena != C_NULL local_ptr = gc_malloc_local(local_arena, bytesize) + if local_ptr != C_NULL + return local_ptr + end end - if local_ptr == C_NULL - gc_malloc_local(master_record.global_arena, bytesize) - else - local_ptr - end + # Try to use the global arena if all else fails. + gc_malloc_local(master_record.global_arena, bytesize) end + + # Try to malloc the object without host intervention. + ptr = @perma_safepoint @nocollect allocate() if ptr != C_NULL return ptr end @@ -565,7 +861,7 @@ end # One megabyte. const MiB = 1 << 20 -# The initial size of the GC heap, currently 16 MiB. 
+# The initial size of the GC heap, currently 20 MiB.
 const initial_gc_heap_size = 16 * MiB
 
 # The default capacity of a root buffer, i.e., the max number of
 # roots that can be stored per thread. Currently set to
@@ -589,6 +885,14 @@ const global_arena_starvation_threshold = 4 * MiB
 # The arena starvation threshold is currently set to 1 MiB.
 const local_arena_starvation_threshold = 1 * MiB
 
+# The point at which a tiny arena is deemed to be starving, i.e.,
+# it no longer contains enough memory to perform basic allocations.
+# If a tiny arena's free byte count stays below the arena starvation
+# threshold after a collection phase, the collector will allocate
+# additional memory to the arena such that it is no longer starving.
+# This threshold is currently disabled (set to 0; the intended value is 2 MiB).
+const tiny_arena_starvation_threshold = 0 # 2 * MiB
+
 # A description of a region of memory that has been allocated to the GC heap.
 struct GCHeapRegion
     # A buffer that contains the GC region's bytes.
@@ -645,24 +949,33 @@ function gc_init!(
         unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * root_buffer_capacity, i)
     end
 
-    # Compute a pointer to the start of the first arena.
+    # Compute a pointer to the start of the tiny arena.
     arena_start_ptr = rootbuf_ptr + rootbuf_bytesize
 
+    # Set up the tiny object arena.
+    if tiny_arena_starvation_threshold > 0
+        arena_for_ants = make_gc_arena!(ScatterAllocArena, arena_start_ptr, Csize_t(tiny_arena_starvation_threshold))
+        arena_start_ptr += tiny_arena_starvation_threshold
+    else
+        arena_for_ants = Base.unsafe_convert(Ptr{ScatterAllocArena}, C_NULL)
+    end
+
     # Set up local arenas.
     for i in 1:local_arena_count
-        local_arena = make_gc_arena!(arena_start_ptr, Csize_t(local_arena_starvation_threshold))
+        local_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold))
         unsafe_store!(local_arenas_ptr, local_arena, i)
         arena_start_ptr += local_arena_starvation_threshold
     end
 
     # Set up the global arena.
- global_arena = make_gc_arena!(arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) + global_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( warp_count, UInt32(thread_count), root_buffer_capacity, UInt32(local_arena_count), + arena_for_ants, local_arenas_ptr, global_arena, safepoint_ptr, @@ -684,7 +997,7 @@ end # Takes a zero-filled region of memory and turns it into an arena # managed by the GC, prefixed with an arena record. -function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} where T +function make_gc_arena!(::Type{FreeListArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} where T # Create a single free list entry. first_entry_ptr = make_gc_block!(start_ptr + sizeof(FreeListArena), size - sizeof(FreeListArena)) @@ -693,6 +1006,48 @@ function make_gc_arena!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} wh unsafe_store!( arena, FreeListArena(0, first_entry_ptr, C_NULL)) + + arena +end + +# Takes a zero-filled region of memory and turns it into a ScatterAlloc +# superblock. +function make_gc_superblock!( + start_ptr::Ptr{T}, + size::Csize_t; + page_size::UInt32 = UInt32(2048), + pages_per_region::UInt32 = UInt32(16))::Ptr{ScatterAllocSuperblock} where T + + region_size = region_bytesize(pages_per_region, page_size) + + # Figure out how many regions we can allocate. + region_count = div(size - header_size(ScatterAllocSuperblock), region_size) + + # At this point, we'd normally allocate regions and pages. + # However, region and page headers are zero-initialized by default. + # So we don't actually need to do anything to set up the regions + # and pages. + + # Allocate the superblock header. 
+ superblock = Base.unsafe_convert(Ptr{ScatterAllocSuperblock}, align_upward(start_ptr)) + unsafe_store!( + superblock, + ScatterAllocSuperblock(region_count, pages_per_region, page_size, C_NULL)) + + superblock +end + +# Takes a zero-filled region of memory and turns it into an arena +# managed by the GC, prefixed with an arena record. +function make_gc_arena!(::Type{ScatterAllocArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{ScatterAllocArena} where T + superblock_ptr = align_upward(start_ptr + sizeof(ScatterAllocArena)) + superblock = make_gc_superblock!(superblock_ptr, Csize_t(start_ptr) + size - Csize_t(superblock_ptr)) + arena = Base.unsafe_convert(Ptr{ScatterAllocArena}, start_ptr) + unsafe_store!( + arena, + ScatterAllocArena(superblock)) + + arena end # Tells if a GC heap contains a particular pointer. From 6e14a2d5cb94a66931ec9eaa230e3b7c185b598c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 19 Mar 2019 16:18:07 +0100 Subject: [PATCH 065/146] Make the allocator smarter --- src/gc.jl | 400 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 348 insertions(+), 52 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 5830c7f3..07f5004d 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -98,6 +98,9 @@ struct FreeListArena allocation_list_head::Ptr{FreeListRecord} end +# Gets a free list arena's lock. +get_lock(arena::Ptr{FreeListArena}) = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) + # A data structure that describes a ScatterAlloc superblock. Every # superblock is prefixed by one of these. struct ScatterAllocSuperblock @@ -217,6 +220,66 @@ struct ScatterAllocArena first_superblock::Ptr{ScatterAllocSuperblock} end +# A "shelf" in a bodega arena. See `BodegaArena` for more info on +# how shelves work. +struct BodegaShelf + # The size of the chunks on this shelf. + chunk_size::Csize_t + + # The maximal number of chunks on this shelf. + capacity::Int64 + + # An index into the shelf that points to the first free + # chunk. 
This is a zero-based index. + chunk_finger::Int64 + + # A pointer to an array of pointers to chunks of memory. + # Every chunk in this array has a chunk size that is + # at least as large as `chunk_size`. + chunks::Ptr{Ptr{UInt8}} +end + +# A GC arena that uses a custom ("bodega") allocation algorithm for allocations. +# Essentially, this type of arena has a list of "shelves" that contain small, +# preallocated chunks of memory that threads can claim in a fast and lock-free +# manner. When the shelves run out of memory, threads may re-stock them from free +# list, amortizing the cost of lock acquisition across many different allocations. +struct BodegaArena + # The number of shelves in the arena. + shelf_count::Int + + # A pointer to an array of shelves. + shelves::Ptr{BodegaShelf} + + # A Boolean that tells if it is sensible to try and restock shelves in this + # arena. Restocking shelves becomes futile once the free list's capacity is + # exhausted. + can_restock::Bool + + # The free list this bodega uses for large allocations and for re-stocking + # the shelves. + free_list::FreeListArena +end + +# Gets a pointer to a bodega arena's free list. +function get_free_list(arena::Ptr{BodegaArena})::Ptr{FreeListArena} + @get_field_pointer(arena, :free_list) +end + +# Gets the first shelf containing chunks that are at least `bytesize` bytes +# in size. Returns null if there is no such shelf. +function get_shelf(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{BodegaShelf} + bodega = unsafe_load(arena) + for i in 1:bodega.shelf_count + shelf = bodega.shelves + (i - 1) * sizeof(BodegaShelf) + chunk_size = unsafe_load(@get_field_pointer(shelf, :chunk_size)) + if chunk_size >= bytesize + return shelf + end + end + return C_NULL +end + # A reference to a Julia object. const ObjectRef = Ptr{Nothing} @@ -259,10 +322,10 @@ struct GCMasterRecord tiny_arena::Ptr{ScatterAllocArena} # A pointer to a list of local GC arena pointers. 
- local_arenas::Ptr{Ptr{FreeListArena}} + local_arenas::Ptr{Ptr{BodegaArena}} # A pointer to the global GC arena. - global_arena::Ptr{FreeListArena} + global_arena::Ptr{BodegaArena} # A pointer to a list of safepoint flags. Every warp has its # own flag. @@ -311,14 +374,14 @@ end # Gets a pointer to the local arena for this thread. This # pointer may be null if there are no local arenas. -@inline function get_local_arena()::Ptr{FreeListArena} +@inline function get_local_arena()::Ptr{BodegaArena} master_record = get_gc_master_record() if master_record.local_arena_count == UInt32(0) return C_NULL else return unsafe_load( master_record.local_arenas, - get_thread_id() % master_record.local_arena_count) + get_warp_id() % master_record.local_arena_count) end end @@ -422,7 +485,7 @@ function gc_use_free_list_entry( entry_ptr::Ptr{Ptr{FreeListRecord}}, allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, entry::Ptr{FreeListRecord}, - bytesize::Csize_t,)::Ptr{UInt8} + bytesize::Csize_t)::Ptr{UInt8} entry_data = unsafe_load(entry) if entry_data.size < bytesize @@ -693,27 +756,137 @@ function gc_malloc_from_free_list( return C_NULL end +# Tries to allocate a chunk of memory from a free list. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +# +# This function is not thread-safe. +function gc_malloc_from_free_list(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} + gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) +end + +# Writes a pointer to a temporary GC frame. This will keep the pointer +# from getting collected until the caller has a chance to add it to its +# own GC frame. 
+function gc_protect(pointer::Ptr{UInt8}) + if pointer != Base.unsafe_convert(Ptr{UInt8}, C_NULL) + gc_frame = new_gc_frame(UInt32(1)) + unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, pointer)) + end +end + # Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} # Acquire the arena's lock. - arena_lock = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) - result_ptr = writer_locked(arena_lock) do + result_ptr = writer_locked(get_lock(arena)) do # Allocate a suitable region of memory. - free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} - allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} - gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + gc_malloc_from_free_list(arena, bytesize) end # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. # Our reasoning for doing this is that doing so ensures that the allocated memory # won't get collected by the GC before the caller has a chance to add it to its # own GC frame. - if result_ptr != Base.unsafe_convert(Ptr{UInt8}, C_NULL) - gc_frame = new_gc_frame(UInt32(1)) - unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, result_ptr)) + gc_protect(result_ptr) + return result_ptr +end + +# Atomically takes a chunk from a shelf. Returns null if the shelf +# is empty. +function gc_malloc_from_shelf(shelf::Ptr{BodegaShelf})::Ptr{UInt8} + capacity = unsafe_load(@get_field_pointer(shelf, :capacity)) + + # Atomically increment the chunk finger. + finger_ptr = @get_field_pointer(shelf, :chunk_finger) + finger = atomic_add!(finger_ptr, 1) + + if finger < capacity + # If the chunk finger was less than the capacity, then we actually + # managed to take a chunk from the shelf. We only need to retrieve + # its address. 
+ chunk_array = unsafe_load(@get_field_pointer(shelf, :chunks)) + return unsafe_load(chunk_array, finger + 1) + else + # Otherwise, we've got nothing. Return null. + return C_NULL + end +end + +# Re-stocks a shelf. +function restock_shelf(arena::Ptr{BodegaArena}, shelf::Ptr{BodegaShelf}) + shelf_size = unsafe_load(@get_field_pointer(shelf, :chunk_size)) + capacity = unsafe_load(@get_field_pointer(shelf, :capacity)) + finger_ptr = @get_field_pointer(shelf, :chunk_finger) + finger = unsafe_load(finger_ptr) + + # The finger may exceed the capacity. This is harmless. Just + # reset the finger to the capacity. + if finger > capacity + finger = capacity + end + + # Actually re-stock the shelf. + free_list = get_free_list(arena) + chunk_array = unsafe_load(@get_field_pointer(shelf, :chunks)) + while finger > 0 + chunk = gc_malloc_from_free_list(free_list, shelf_size) + if chunk == C_NULL + # We exhausted the free list. Better break now. Also set + # the arena's `can_restock` flag to false so there will be + # no future attempts to re-stock shelves. + unsafe_store!(@get_field_pointer(arena, :can_restock), false) + break + end + + # Update the chunk array. + unsafe_store!(chunk_array, chunk, finger) + finger -= 1 end + + # Update the finger. + unsafe_store!(finger_ptr, finger) +end + +# Tries to allocate a chunk of memory in a particular GC arena. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{UInt8} + # The bodega arena might be empty (or approximately empty). If so, then we'll + # just return null early. There's no need to scrape the bottom of the barrel. + if !unsafe_load(@get_field_pointer(arena, :can_restock)) + return C_NULL + end + + # Find the right shelf for this allocation. + shelf = get_shelf(arena, bytesize) + free_list = get_free_list(arena) + if shelf == C_NULL + # The shelves' chunk sizes are all too small to accommodate this + # allocation. 
Use the free list directly. + return gc_malloc_local(free_list, bytesize) + end + + # Acquire a reader lock on the arena and try to take a chunk + # from the shelf. + lock = get_lock(free_list) + result_ptr = reader_locked(lock) do + gc_malloc_from_shelf(shelf) + end + + if result_ptr == C_NULL + # Looks like we need to re-stock the shelf. While we're at it, + # we might as well grab a chunk of memory for ourselves. + result_ptr = writer_locked(lock) do + restock_shelf(arena, shelf) + gc_malloc_from_free_list(free_list, bytesize) + end + end + + gc_protect(result_ptr) return result_ptr end @@ -931,8 +1104,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Allocate a local arena pointer buffer. - local_arenas_bytesize = sizeof(Ptr{FreeListArena}) * local_arena_count - local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{FreeListArena}}, gc_memory_start_ptr) + local_arenas_bytesize = sizeof(Ptr{BodegaArena}) * local_arena_count + local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{BodegaArena}}, gc_memory_start_ptr) # Allocate the safepoint flag buffer. safepoint_bytesize = sizeof(SafepointState) * warp_count @@ -962,13 +1135,13 @@ function gc_init!( # Set up local arenas. for i in 1:local_arena_count - local_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) + local_arena = make_gc_arena!(BodegaArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) unsafe_store!(local_arenas_ptr, local_arena, i) arena_start_ptr += local_arena_starvation_threshold end # Set up the global arena. 
- global_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) + global_arena = make_gc_arena!(BodegaArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( warp_count, @@ -1010,6 +1183,49 @@ function make_gc_arena!(::Type{FreeListArena}, start_ptr::Ptr{T}, size::Csize_t) arena end +# Takes a zero-filled region of memory and turns it into an arena +# managed by the GC, prefixed with an arena record. +function make_gc_arena!(::Type{BodegaArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{BodegaArena} where T + current_ptr = start_ptr + sizeof(BodegaArena) + + # Set up some shelf chunk arrays + shelf_records = [] + for chunk_size in [32, 64] + capacity = 2048 + shelf_chunk_array = Base.unsafe_convert(Ptr{Ptr{UInt8}}, current_ptr) + current_ptr += capacity * sizeof(Ptr{UInt8}) + push!(shelf_records, BodegaShelf(Csize_t(chunk_size), capacity, capacity, shelf_chunk_array)) + end + + # Set up the shelves. + shelf_array = Base.unsafe_convert(Ptr{BodegaShelf}, current_ptr) + for record in shelf_records + shelf = Base.unsafe_convert(Ptr{BodegaShelf}, current_ptr) + current_ptr += sizeof(BodegaShelf) + unsafe_store!(shelf, record) + end + + # Set up a free list entry. + first_entry_ptr = make_gc_block!(current_ptr, Csize_t(start_ptr + size) - Csize_t(current_ptr)) + + # Set up the arena record. + arena = Base.unsafe_convert(Ptr{BodegaArena}, start_ptr) + unsafe_store!( + arena, + BodegaArena( + length(shelf_records), + shelf_array, + true, + FreeListArena(0, first_entry_ptr, C_NULL))) + + # Stock the shelves. + for record in shelf_records + restock_shelf(arena, get_shelf(arena, record.chunk_size)) + end + + arena +end + # Takes a zero-filled region of memory and turns it into a ScatterAlloc # superblock. 
function make_gc_superblock!( @@ -1134,22 +1350,54 @@ function get_record( end # Iterates through a linked list of allocation records and apply a function -# to every node in the linked list. The function is allowed to modify allocation -# records. -@inline function iterate_allocation_records(fun::Function, head::Ptr{FreeListRecord}) +# to every node in the linked list. +function iterate_allocation_records(fun::Function, head::Ptr{FreeListRecord}) while head != C_NULL fun(head) head = unsafe_load(head).next end end +# Iterates through all active allocation records in a GC arena. +function iterate_allocated(fun::Function, arena::Ptr{FreeListArena}) + allocation_list_head = unsafe_load(arena).allocation_list_head + iterate_allocation_records(fun, allocation_list_head) +end + +# Iterates through all active allocation records in a GC arena. +function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) + # Compose a set that contains all data addresses of chunks that + # are on the shelves. + arena_data = unsafe_load(arena) + chunks_on_shelves = Set{Ptr{UInt8}}() + for i in 1:arena_data.shelf_count + shelf = unsafe_load(arena_data.shelves, i) + for j in shelf.chunk_finger:(shelf.capacity - 1) + push!(chunks_on_shelves, unsafe_load(shelf.chunks, j)) + end + end + + # Now iterate through the allocation list, ignoring records that have + # been placed on the shelves. + iterate_allocated(get_free_list(arena)) do record + if !(data_pointer(record) in chunks_on_shelves) + fun(record) + end + end +end + +# Iterates through all free allocation records in a GC arena. +function iterate_free(fun::Function, arena::Ptr{FreeListArena}) + free_list_head = unsafe_load(arena).free_list_head + iterate_allocation_records(fun, free_list_head) +end + # Takes a GC master record and constructs a sorted allocation list # based on it. 
function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationList records = [] iterate_arenas(master_record) do arena - allocation_list_head = unsafe_load(arena).allocation_list_head - iterate_allocation_records(allocation_list_head) do record + iterate_allocated(arena) do record push!(records, record) end end @@ -1157,16 +1405,46 @@ function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationLi return SortedAllocationList(records) end +# Frees all dead blocks in an arena. +function gc_free_garbage(arena::Ptr{FreeListArena}, live_blocks::Set{Ptr{FreeListRecord}}) + record_ptr = @get_field_pointer(arena, :allocation_list_head) + while true + record = unsafe_load(record_ptr) + if record == C_NULL + # We've reached the end of the list. + break + end + + if record in live_blocks + # We found a live block. Proceed to the next block. + record_ptr = @get_field_pointer(record, :next) + else + # We found a dead block. Release it. Don't proceed to the + # next block because the current block will change in the + # next iteration of this loop. + gc_free_local(arena, record_ptr) + end + end +end + +# Frees all dead blocks in an arena. +function gc_free_garbage(arena::Ptr{BodegaArena}, live_blocks::Set{Ptr{FreeListRecord}}) + # Free garbage in the free list sub-arena. + gc_free_garbage(get_free_list(arena), live_blocks) + + # Mark the arena as ready for restocking. + unsafe_store!(@get_field_pointer(arena, :can_restock), true) +end + # Compact a GC arena's free list. This function will # 1. merge adjancent free blocks, and # 2. reorder free blocks to put small blocks at the front # of the free list, # 3. tally the total number of free bytes and return that number. -function gc_compact_free_list(arena::Ptr{FreeListArena})::Csize_t +function gc_compact(arena::Ptr{FreeListArena})::Csize_t # Let's start by creating a list of all free list records. 
     records = Ptr{FreeListRecord}[]
-    free_list_head = unsafe_load(arena).free_list_head
-    iterate_allocation_records(free_list_head) do record
+    iterate_free(arena) do record
         push!(records, record)
     end
@@ -1207,6 +1485,46 @@ function gc_compact_free_list(arena::Ptr{FreeListArena})::Csize_t
     return sum(record -> unsafe_load(record).size, records)
 end
 
+# Compact a GC arena's free list. This function will
+# 1. merge adjacent free blocks, and
+# 2. reorder free blocks to put small blocks at the front
+# of the free list,
+# 3. tally the total number of free bytes and return that number.
+function gc_compact(arena::Ptr{BodegaArena})::Csize_t
+    # Compact the free list.
+    tally = gc_compact(get_free_list(arena))
+
+    # Add the size of the chunks on shelves to the tally.
+    shelf_count = unsafe_load(@get_field_pointer(arena, :shelf_count))
+    for i in 1:shelf_count
+        shelf_array = unsafe_load(@get_field_pointer(arena, :shelves))
+        shelf_data = unsafe_load(shelf_array, i)
+
+        finger = shelf_data.chunk_finger
+        if finger > shelf_data.capacity
+            finger = shelf_data.capacity
+        end
+        tally += shelf_data.chunk_size * (shelf_data.capacity - finger)
+    end
+
+    tally
+end
+
+# Expands a GC arena by assigning it an additional heap region.
+function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion)
+    extra_record = make_gc_block!(region.start, region.size)
+    last_free_list_ptr = @get_field_pointer(arena, :free_list_head)
+    iterate_free(arena) do record
+        last_free_list_ptr = @get_field_pointer(record, :next)
+    end
+    unsafe_store!(last_free_list_ptr, extra_record)
+end
+
+# Expands a GC arena by assigning it an additional heap region.
+function gc_expand(arena::Ptr{BodegaArena}, region::GCHeapRegion) + gc_expand(get_free_list(arena), region) +end + """A report of the GC's actions.""" mutable struct GCReport """The total wall-clock time of a kernel execution.""" @@ -1323,27 +1641,11 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, # free dead blocks. Next, we will compact and reorder free lists to combat # fragmentation. iterate_arenas(master_record) do arena - record_ptr = @get_field_pointer(arena, :allocation_list_head) - while true - record = unsafe_load(record_ptr) - if record == C_NULL - # We've reached the end of the list. - break - end + # Free garbage blocks. + gc_free_garbage(arena, live_blocks) - if record in live_blocks - # We found a live block. Proceed to the next block. - record_ptr = @get_field_pointer(record, :next) - else - # We found a dead block. Release it. Don't proceed to the - # next block because the current block will change in the - # next iteration of this loop. - gc_free_local(arena, record_ptr) - end - end - - # Compact the free list. - free_memory = gc_compact_free_list(arena) + # Compact the arena. 
+ free_memory = gc_compact(arena) # If the amount of free memory in the arena is below the starvation # limit then we'll expand the GC heap and add the additional memory @@ -1356,19 +1658,13 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, if free_memory < threshold region = expand!(heap, threshold) + gc_expand(arena, region) if arena == master_record.global_arena report.extra_global_memory += Csize_t(threshold) else report.extra_local_memory += Csize_t(threshold) end - - extra_record = make_gc_block!(region.start, region.size) - last_free_list_ptr = @get_field_pointer(arena, :free_list_head) - iterate_allocation_records(unsafe_load(last_free_list_ptr)) do record - last_free_list_ptr = @get_field_pointer(record, :next) - end - unsafe_store!(last_free_list_ptr, extra_record) end end end From 699fcea7304509f35c3e3f1e44035c592624fd64 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 20 Mar 2019 10:49:22 +0100 Subject: [PATCH 066/146] Tweak GC memory hierarchy --- src/gc.jl | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 07f5004d..742559d0 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -325,7 +325,7 @@ struct GCMasterRecord local_arenas::Ptr{Ptr{BodegaArena}} # A pointer to the global GC arena. - global_arena::Ptr{BodegaArena} + global_arena::Ptr{FreeListArena} # A pointer to a list of safepoint flags. Every warp has its # own flag. @@ -435,7 +435,7 @@ Signals that this warp has reached a GC safepoint. """ function gc_safepoint() wait_for_interrupt() do - gc_set_safepoint_flag(in_safepoint) + gc_set_safepoint_flag(in_safepoint; overwrite = false) end return end @@ -459,11 +459,15 @@ function gc_perma_safepoint() end # Sets this warp's safepoint flag to a particular state. 
-function gc_set_safepoint_flag(value::SafepointState) +function gc_set_safepoint_flag(value::SafepointState; overwrite::Bool = true) master_record = get_gc_master_record() warp_id = get_warp_id() safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointState) * (warp_id - 1) - volatile_store!(safepoint_flag_ptr, value) + if overwrite + volatile_store!(safepoint_flag_ptr, value) + else + atomic_compare_exchange!(safepoint_flag_ptr, not_in_safepoint, value) + end return end @@ -922,8 +926,16 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} end end - # Try to use the global arena if all else fails. - gc_malloc_local(master_record.global_arena, bytesize) + # Try to use the global arena if all else fails, but only if the chunk + # of memory we want to allocate is sufficiently large. Allocating lots of + # small chunks in the global arena will result in undue contention and slow + # down kernels dramatically. + if bytesize >= 1024 + local_ptr = gc_malloc_local(master_record.global_arena, bytesize) + else + local_ptr = C_NULL + end + return local_ptr end # Try to malloc the object without host intervention. @@ -939,11 +951,7 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} # first thread that acquires the interrupt lock, but it is quite # likely to succeed if we are *not* in the first thread that # acquired the garbage collector lock. - # - # Note: don't try to allocate in the local arena first because - # we have already acquired a device-wide lock. Allocating in - # the local arena first might waste precious time. - ptr2 = gc_malloc_local(master_record.global_arena, bytesize) + ptr2 = allocate() if ptr2 == C_NULL # We are either the first thread to acquire the interrupt lock @@ -1035,7 +1043,7 @@ end const MiB = 1 << 20 # The initial size of the GC heap, currently 20 MiB. 
-const initial_gc_heap_size = 16 * MiB +const initial_gc_heap_size = 20 * MiB # The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to @@ -1055,8 +1063,8 @@ const global_arena_starvation_threshold = 4 * MiB # If a local arena's free byte count stays below the arena starvation # threshold after a collection phase, the collector will allocate # additional memory to the arena such that it is no longer starving. -# The arena starvation threshold is currently set to 1 MiB. -const local_arena_starvation_threshold = 1 * MiB +# The arena starvation threshold is currently set to 2 MiB. +const local_arena_starvation_threshold = 2 * MiB # The point at which a tiny arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. @@ -1141,7 +1149,7 @@ function gc_init!( end # Set up the global arena. - global_arena = make_gc_arena!(BodegaArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) + global_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( warp_count, From bdd6c0b3a6a4210aff9a7fa31ad15b1501adc28f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 20 Mar 2019 10:49:31 +0100 Subject: [PATCH 067/146] Create a linked list example --- examples/linked-list.jl | 78 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 examples/linked-list.jl diff --git a/examples/linked-list.jl b/examples/linked-list.jl new file mode 100644 index 00000000..b0eb958a --- /dev/null +++ b/examples/linked-list.jl @@ -0,0 +1,78 @@ +using CUDAnative, CUDAdrv +using Test +import Base: foldl, reduce, sum + +# This test constructs a linked list in a GPU kernel. 
+ +use_gc = true + +abstract type List{T} +end + +mutable struct Nil{T} <: List{T} +end + +mutable struct Cons{T} <: List{T} + value::T + next::List{T} +end + +Cons{T}(value::T) where T = Cons{T}(value, Nil{T}()) + +function List{T}(pointer, count::Integer) where T + result = Nil{T}() + for i in count:-1:1 + result = Cons{T}(unsafe_load(pointer, i), result) + end + result +end + +function foldl(op, list::List{T}; init) where T + node = list + accumulator = init + while isa(node, Cons{T}) + accumulator = op(accumulator, node.value) + node = node.next + end + accumulator +end + +function reduce(op, list::List{T}; init) where T + foldl(op, list; init=init) +end + +function sum(list::List{T}) where T + reduce(+, list; init=zero(T)) +end + +const element_count = 200 +const thread_count = 256 + +function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + l = List{Int64}(elements, element_count) + unsafe_store!(results, sum(l), i) + return +end + +# Allocate two arrays. +source_array = Mem.alloc(Int64, element_count) +destination_array = Mem.alloc(Int64, thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +# Fill the source and destination arrays. +Mem.upload!(source_array, Array(1:element_count)) +Mem.upload!(destination_array, zeros(Int64, thread_count)) + +# Run the kernel. 
+if use_gc + @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + stats = @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) +else + @cuda threads=thread_count kernel(source_pointer, destination_pointer) + stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) +end +println(stats) + +@test Mem.download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count) From 1b93aba6b406e3eaf87df3d34df4de490f0e6480 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 21 Mar 2019 12:55:09 +0100 Subject: [PATCH 068/146] Fix imperfect rebase --- src/CUDAnative.jl | 4 +--- src/compiler/optim.jl | 16 ++++++---------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 85ea5ef9..8006ac8e 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -31,8 +31,6 @@ include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) include(joinpath("device", "llvm.jl")) include(joinpath("device", "runtime.jl")) -include(joinpath("device", "libdevice.jl")) -include(joinpath("device", "cuda_intrinsics.jl")) include(joinpath("device", "threading.jl")) # The interrupts and GC files need to be loaded _before_ the @@ -40,7 +38,7 @@ include(joinpath("device", "threading.jl")) # depend on the GC and the GC depends on interrupts. 
include("interrupts.jl") include("gc.jl") -include(joinpath("device", "runtime_intrinsics.jl")) +include(joinpath("device", "runtime.jl")) include("compiler.jl") include("execution.jl") diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 5ccf964a..cac90195 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -70,7 +70,12 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int ModulePassManager() do pm initialize!(pm) - add!(pm, ModulePass("FinalLowerGCGPU", lower_final_gc_intrinsics!)) + if ctx.gc + add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) + add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) + else + add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) + end aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) @@ -86,15 +91,6 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int # PTX-specific optimizations ModulePassManager() do pm initialize!(pm) - # lower intrinsics - if ctx.gc - add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) - add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) - else - add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) - end - aggressive_dce!(pm) # remove dead uses of ptls - add!(pm, ModulePass("LowerPTLS", lower_ptls!)) # NVPTX's target machine info enables runtime unrolling, # but Julia's pass sequence only invokes the simple unroller. 
From bb03af28f9dfc9e33016a44672feff0092b0b97a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 21 Mar 2019 16:06:57 +0100 Subject: [PATCH 069/146] Add a StaticArrays-based GC example --- examples/matrix-static-arrays.jl | 40 ++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 examples/matrix-static-arrays.jl diff --git a/examples/matrix-static-arrays.jl b/examples/matrix-static-arrays.jl new file mode 100644 index 00000000..12e54e98 --- /dev/null +++ b/examples/matrix-static-arrays.jl @@ -0,0 +1,40 @@ +using StaticArrays, CUDAnative, CUDAdrv + +use_gc = false + +const matrix_dim = 40 +const iterations = 20 +const thread_count = 256 + +function fill() + m = zeros(MMatrix{matrix_dim, matrix_dim, Int64}) + + for i in 1:matrix_dim + for j in 1:matrix_dim + m[i, j] = i * j + end + end + + return m +end + +function kernel(result::CUDAnative.DevicePtr{Int64}) + thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + # Write the accumulator to the result array. 
+ unsafe_store!(result, fill()[20, 30], thread_id) + + return +end + +destination_array = Mem.alloc(Int64, thread_count) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +if use_gc + @cuda_gc threads=thread_count kernel(destination_pointer) + stats = @cuda_gc threads=thread_count kernel(destination_pointer) +else + @cuda threads=thread_count kernel(destination_pointer) + stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) +end +println(stats) From 46614f3854f0b4811e7a49688f8092c823872004 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 13:10:58 +0200 Subject: [PATCH 070/146] Teach allocator to transfer memory block ownership --- src/device/threading.jl | 16 +++-- src/gc.jl | 141 ++++++++++++++++++++++++++++++---------- 2 files changed, 118 insertions(+), 39 deletions(-) diff --git a/src/device/threading.jl b/src/device/threading.jl index a7de7645..846db990 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -127,12 +127,16 @@ function warp_serialized(func::Function) end """ - reader_locked(func::Function, lock::ReaderWriterLock) + reader_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) Acquires a reader-writer lock in reader mode, runs `func` while the lock is acquired and releases the lock again. """ -function reader_locked(func::Function, lock::ReaderWriterLock) +function reader_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + if !acquire_lock + return func() + end + while true # Increment the reader count. 
If the lock is in write-acquired mode, # then the lock will stay in that mode (unless the reader count is @@ -157,12 +161,16 @@ function reader_locked(func::Function, lock::ReaderWriterLock) end """ - writer_locked(func::Function, lock::ReaderWriterLock) + writer_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) Acquires a reader-writer lock in writer mode, runs `func` while the lock is acquired and releases the lock again. """ -function writer_locked(func::Function, lock::ReaderWriterLock) +function writer_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + if !acquire_lock + return func() + end + warp_serialized() do # Try to move the lock from 'idle' to 'write-acquired'. while atomic_compare_exchange!(lock.state_ptr, 0, -max_rw_lock_readers) != 0 diff --git a/src/gc.jl b/src/gc.jl index 742559d0..4df493a6 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -266,6 +266,9 @@ function get_free_list(arena::Ptr{BodegaArena})::Ptr{FreeListArena} @get_field_pointer(arena, :free_list) end +# Gets a bodega arena's lock. +get_lock(arena::Ptr{BodegaArena}) = get_lock(get_free_list(arena)) + # Gets the first shelf containing chunks that are at least `bytesize` bytes # in size. Returns null if there is no such shelf. function get_shelf(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{BodegaShelf} @@ -377,11 +380,11 @@ end @inline function get_local_arena()::Ptr{BodegaArena} master_record = get_gc_master_record() if master_record.local_arena_count == UInt32(0) - return C_NULL + return Base.unsafe_convert(Ptr{BodegaArena}, C_NULL) else return unsafe_load( master_record.local_arenas, - get_warp_id() % master_record.local_arena_count) + ((get_warp_id() - 1) % master_record.local_arena_count) + 1) end end @@ -483,13 +486,13 @@ macro perma_safepoint(expr) end end -# Tries to use a free-list entry to allocate a chunk of data of size `bytesize`. -# Updates the free list if the allocation succeeds. Returns a null pointer otherwise. 
-function gc_use_free_list_entry( +# Tries to use a free-list entry to allocate a chunk of data of size `bytesize`, +# producing an appropriately-sized free list entry that prefixes the data. This +# entry is removed from the free list but not yet added to the allocation list. +function gc_take_list_entry( entry_ptr::Ptr{Ptr{FreeListRecord}}, - allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, entry::Ptr{FreeListRecord}, - bytesize::Csize_t)::Ptr{UInt8} + bytesize::Csize_t)::Ptr{FreeListRecord} entry_data = unsafe_load(entry) if entry_data.size < bytesize @@ -540,19 +543,21 @@ function gc_use_free_list_entry( unsafe_store!(entry_ptr, entry_data.next) end - # At this point, all we need to do is update the allocation record to - # reflect the fact that it now represents an allocated block instead of - # a free block. + return entry +end + +# Prepends a free list record to a free list. +function gc_add_to_free_list( + entry::Ptr{FreeListRecord}, + list_ptr::Ptr{Ptr{FreeListRecord}}) # Set the `next` pointer to the value stored at the allocation list pointer. unsafe_store!( @get_field_pointer(entry, :next)::Ptr{Ptr{FreeListRecord}}, - unsafe_load(allocation_list_ptr)) + unsafe_load(list_ptr)) # Update the allocation list pointer to point to the entry. - unsafe_store!(allocation_list_ptr, entry) - - return data_address + unsafe_store!(list_ptr, entry) end # Tries to allocate a chunk of memory from a ScatterAlloc page. @@ -733,24 +738,22 @@ end # Tries to allocate a chunk of memory from a free list. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -# -# `free_list_ptr` is a pointer to the head of the free list. -# `allocation_list_ptr` is a pointer to the head of the allocation list. -# -# This function is not thread-safe. -function gc_malloc_from_free_list( +# If the result is non-null, then a free list record is +# returned that has been taken from the free list but not +# yet added to another list. 
+function gc_take_any_list_entry( free_list_ptr::Ptr{Ptr{FreeListRecord}}, - allocation_list_ptr::Ptr{Ptr{FreeListRecord}}, - bytesize::Csize_t)::Ptr{UInt8} + bytesize::Csize_t)::Ptr{FreeListRecord} + # To allocate memory, we will walk the free list until we find a suitable candidate. - while free_list_ptr != C_NULL + while true free_list_item = unsafe_load(free_list_ptr) if free_list_item == C_NULL break end - result = gc_use_free_list_entry(free_list_ptr, allocation_list_ptr, free_list_item, bytesize) + result = gc_take_list_entry(free_list_ptr, free_list_item, bytesize) if result != C_NULL return result end @@ -768,7 +771,20 @@ end function gc_malloc_from_free_list(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} - gc_malloc_from_free_list(free_list_ptr, allocation_list_ptr, bytesize) + + # Try to take the entry out of the free list. + result_entry = gc_take_any_list_entry(free_list_ptr, bytesize) + if result_entry == C_NULL + # The entry is just too small. Return a `null` pointer. + return C_NULL + end + + # At this point, all we need to do is update the allocation record to + # reflect the fact that it now represents an allocated block instead of + # a free block. + gc_add_to_free_list(result_entry, allocation_list_ptr) + + return data_pointer(result_entry) end # Writes a pointer to a temporary GC frame. This will keep the pointer @@ -784,9 +800,9 @@ end # Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t; acquire_lock=true)::Ptr{UInt8} # Acquire the arena's lock. 
- result_ptr = writer_locked(get_lock(arena)) do + result_ptr = writer_locked(get_lock(arena); acquire_lock=acquire_lock) do # Allocate a suitable region of memory. gc_malloc_from_free_list(arena, bytesize) end @@ -858,7 +874,7 @@ end # Tries to allocate a chunk of memory in a particular GC arena. # Returns a null pointer if no sufficiently large chunk of # memory can be found. -function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{UInt8} +function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t; acquire_lock=true)::Ptr{UInt8} # The bodega arena might be empty (or approximately empty). If so, then we'll # just return null early. There's no need to scrape the bottom of the barrel. if !unsafe_load(@get_field_pointer(arena, :can_restock)) @@ -877,14 +893,14 @@ function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{UInt8} # Acquire a reader lock on the arena and try to take a chunk # from the shelf. lock = get_lock(free_list) - result_ptr = reader_locked(lock) do + result_ptr = reader_locked(lock; acquire_lock=acquire_lock) do gc_malloc_from_shelf(shelf) end if result_ptr == C_NULL # Looks like we need to re-stock the shelf. While we're at it, # we might as well grab a chunk of memory for ourselves. - result_ptr = writer_locked(lock) do + result_ptr = writer_locked(lock; acquire_lock=acquire_lock) do restock_shelf(arena, shelf) gc_malloc_from_free_list(free_list, bytesize) end @@ -894,6 +910,48 @@ function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{UInt8} return result_ptr end +# Transfers a block of free memory from one arena to another and then +# allocates a differently-sized block of memory from the destination +# arena. 
+function gc_transfer_and_malloc( + from_arena::Ptr{FreeListArena}, + to_arena::Ptr{FreeListArena}, + transfer_bytesize::Csize_t, + alloc_bytesize::Csize_t)::Ptr{UInt8} + + from_free_list = @get_field_pointer(from_arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + entry = writer_locked(get_lock(from_arena)) do + # Try to take the entry out of the free list. + gc_take_any_list_entry(from_free_list, transfer_bytesize) + end + + if entry == C_NULL + return C_NULL + else + to_free_list = @get_field_pointer(to_arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + return writer_locked(get_lock(to_arena)) do + gc_add_to_free_list(entry, to_free_list) + gc_malloc_local(to_arena, alloc_bytesize; acquire_lock=false) + end + end +end + +# Transfers a block of free memory from one arena to another and then +# allocates a differently-sized block of memory from the destination +# arena. +function gc_transfer_and_malloc( + from_arena::Ptr{FreeListArena}, + to_arena::Ptr{BodegaArena}, + transfer_bytesize::Csize_t, + alloc_bytesize::Csize_t)::Ptr{UInt8} + + gc_transfer_and_malloc( + from_arena, + get_free_list(to_arena), + transfer_bytesize, + alloc_bytesize) +end + """ gc_malloc(bytesize::Csize_t)::Ptr{UInt8} @@ -924,16 +982,29 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} if local_ptr != C_NULL return local_ptr end + else + # If there is no local arena then we will just have to allocate + # from the global arena directly. + return gc_malloc_local(master_record.global_arena, bytesize) end # Try to use the global arena if all else fails, but only if the chunk # of memory we want to allocate is sufficiently large. Allocating lots of # small chunks in the global arena will result in undue contention and slow # down kernels dramatically. - if bytesize >= 1024 + # + # If we need to allocate a small chunk of memory but the local arena is + # empty, then we will transfer a *much* larger chunk of memory from the global + # arena to the local arena. 
After that we'll allocate in the local arena. + min_global_alloc_size = Csize_t(256 * (1 << 10)) + if bytesize >= min_global_alloc_size local_ptr = gc_malloc_local(master_record.global_arena, bytesize) else - local_ptr = C_NULL + local_ptr = gc_transfer_and_malloc( + master_record.global_arena, + local_arena, + min_global_alloc_size, + bytesize) end return local_ptr end @@ -1042,8 +1113,8 @@ end # One megabyte. const MiB = 1 << 20 -# The initial size of the GC heap, currently 20 MiB. -const initial_gc_heap_size = 20 * MiB +# The initial size of the GC heap, currently 16 MiB. +const initial_gc_heap_size = 16 * MiB # The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to @@ -1064,7 +1135,7 @@ const global_arena_starvation_threshold = 4 * MiB # threshold after a collection phase, the collector will allocate # additional memory to the arena such that it is no longer starving. # The arena starvation threshold is currently set to 2 MiB. -const local_arena_starvation_threshold = 2 * MiB +const local_arena_starvation_threshold = 1 * MiB # The point at which a tiny arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. 
From d60114b053f205e8026b1fc759cb10f63268b00d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 13:11:54 +0200 Subject: [PATCH 071/146] Update examples --- examples/linked-list.jl | 4 ++-- examples/matrix.jl | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/linked-list.jl b/examples/linked-list.jl index b0eb958a..2c7e949c 100644 --- a/examples/linked-list.jl +++ b/examples/linked-list.jl @@ -45,8 +45,8 @@ function sum(list::List{T}) where T reduce(+, list; init=zero(T)) end -const element_count = 200 -const thread_count = 256 +const element_count = 1000 +const thread_count = 32 function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) i = (blockIdx().x-1) * blockDim().x + threadIdx().x diff --git a/examples/matrix.jl b/examples/matrix.jl index a787171f..277aacd1 100644 --- a/examples/matrix.jl +++ b/examples/matrix.jl @@ -5,7 +5,6 @@ using StaticArrays, CUDAnative, CUDAdrv import Base: getindex, setindex!, pointer, unsafe_convert, zeros -using InteractiveUtils const use_gc = true From 82208c87ff94f0f1910dad24a46c67a6ad283f4e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 13:13:01 +0200 Subject: [PATCH 072/146] Introduce benchmarking utilities --- examples/matrix-static-arrays.jl | 24 +++++-------- examples/utils.jl | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 examples/utils.jl diff --git a/examples/matrix-static-arrays.jl b/examples/matrix-static-arrays.jl index 12e54e98..5e174bf5 100644 --- a/examples/matrix-static-arrays.jl +++ b/examples/matrix-static-arrays.jl @@ -1,9 +1,6 @@ -using StaticArrays, CUDAnative, CUDAdrv - -use_gc = false +using StaticArrays, CUDAnative, CUDAdrv, BenchmarkTools const matrix_dim = 40 -const iterations = 20 const thread_count = 256 function fill() @@ -20,21 +17,18 @@ end function kernel(result::CUDAnative.DevicePtr{Int64}) thread_id = (blockIdx().x - 1) * blockDim().x + 
threadIdx().x - - # Write the accumulator to the result array. unsafe_store!(result, fill()[20, 30], thread_id) - return end -destination_array = Mem.alloc(Int64, thread_count) -destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) +include("utils.jl") -if use_gc - @cuda_gc threads=thread_count kernel(destination_pointer) - stats = @cuda_gc threads=thread_count kernel(destination_pointer) -else - @cuda threads=thread_count kernel(destination_pointer) - stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) +function benchmark() + destination_array = Mem.alloc(Int64, thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + @cuda_sync threads=thread_count kernel(destination_pointer) end + +stats = @cuda_benchmark benchmark() +println(length(stats)) println(stats) diff --git a/examples/utils.jl b/examples/utils.jl new file mode 100644 index 00000000..253b8622 --- /dev/null +++ b/examples/utils.jl @@ -0,0 +1,60 @@ +use_gc = true + +""" + device_reset!(dev::CuDevice=device()) + +Reset the CUDA state associated with a device. This call with release the underlying +context, at which point any objects allocated in that context will be invalidated. +""" +function device_reset!(dev::CuDevice=CUDAdrv.device()) + if haskey(CUDAnative.device_contexts, dev) + # take the context out of the pool, and finalize it to trigger release + old_ctx = CUDAnative.device_contexts[dev] + delete!(CUDAnative.device_contexts, dev) + finalize(old_ctx) + + # unless the user switches devices, new API calls should trigger initialization + CUDAdrv.apicall_hook[] = CUDAnative.maybe_initialize + CUDAnative.initialized[] = false + # HACK: the context changes, but CuCurrentContext() _can_ actually return a handle + # with the same pointer value... this bypasses the compile cache, and crashes + empty!(CUDAnative.compilecache) + end +end + +""" + @sync ex +Run expression `ex` and synchronize the GPU afterwards. 
This is a CPU-friendly +synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As +such, this operation is preferred over implicit synchronization (e.g. when performing a +memory copy) for high-performance applications. +It is also useful for timing code that executes asynchronously. +""" +macro sync(ex) + # Copied from https://github.com/JuliaGPU/CuArrays.jl/blob/8e45a27f2b12796f47683340845f98f017865676/src/utils.jl#L68-L86 + quote + local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING) + local ret = $(esc(ex)) + CUDAdrv.record(e) + CUDAdrv.synchronize(e) + ret + end +end + +macro cuda_sync(args...) + if use_gc + esc(quote + CUDAnative.@cuda_gc $(args...) + end) + else + esc(quote + @sync CUDAnative.@cuda $(args...) + end) + end +end + +macro cuda_benchmark(ex) + esc(quote + @benchmark $(ex) teardown=(device_reset!()) evals=1 + end) +end From b18a0bb49a2c101ade25d1bc32efa84d9bf463bf Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 13:51:10 +0200 Subject: [PATCH 073/146] Fix some typos --- src/CUDAnative.jl | 1 - src/compiler/optim.jl | 2 +- src/execution.jl | 4 ++-- src/gc.jl | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 8006ac8e..0a040a87 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -30,7 +30,6 @@ include(joinpath("device", "pointer.jl")) include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) include(joinpath("device", "llvm.jl")) -include(joinpath("device", "runtime.jl")) include(joinpath("device", "threading.jl")) # The interrupts and GC files need to be loaded _before_ the diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index cac90195..cf5e8da3 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -70,7 +70,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int ModulePassManager() do pm initialize!(pm) - if ctx.gc + if job.gc 
add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) else diff --git a/src/execution.jl b/src/execution.jl index ecfbefe5..f8b902e6 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -450,7 +450,7 @@ functionality is included in [`@cuda`](@ref). The 'init' keyword argument is a function that takes a kernel as argument and sets up an environment for the kernel. """ -function prepare_kernel(kernel::Kernel{F,TT}; init::Function=nop_init_kernel) where {F,TT} +function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel) where {F,TT} # Just call the 'init' function for now. init(kernel) end @@ -514,7 +514,7 @@ function nearest_warpsize(dev::CuDevice, threads::Integer) return threads + (ws - threads % ws) % ws end -function nop_init_kernel(kernel::Kernel{F,TT}) where {F,TT} +function nop_init_kernel(kernel::AbstractKernel{F,TT}) where {F,TT} # Do nothing. return end \ No newline at end of file diff --git a/src/gc.jl b/src/gc.jl index 4df493a6..9da1ae5b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1795,7 +1795,7 @@ macro cuda_gc(ex...) args = call.args[2:end] code = quote end - compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + env_kwargs, compiler_kwargs, call_kwargs = CUDAnative.split_kwargs(kwargs) vars, var_exprs = CUDAnative.assign_args!(code, args) # Find the stream on which the kernel is to be scheduled. From 11042c4f248c1edf945ebb83dcf4642da18ea395 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 14:27:31 +0200 Subject: [PATCH 074/146] Update '@cuda_interruptible' --- src/interrupts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/interrupts.jl b/src/interrupts.jl index 83fe13d5..de7cc7cb 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -237,7 +237,7 @@ macro cuda_interruptible(handler, ex...) 
args = call.args[2:end] code = quote end - compiler_kwargs, call_kwargs, env_kwargs = CUDAnative.split_kwargs(kwargs) + env_kwargs, compiler_kwargs, call_kwargs = CUDAnative.split_kwargs(kwargs) vars, var_exprs = CUDAnative.assign_args!(code, args) # Find the stream on which the kernel is to be scheduled. From b183aafce8fce2ca5a59901a6344472eb3082259 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 14:29:17 +0200 Subject: [PATCH 075/146] Don't try to include deleted intrinsics test file --- test/runtests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 6cac0eb5..11738b64 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -69,7 +69,6 @@ if CUDAnative.configured include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") - include("device/intrinsics.jl") include("device/threading.jl") include("device/gc.jl") From 0402ad870482fade6d1251c882bc7c631723e95f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 15:25:12 +0200 Subject: [PATCH 076/146] Switch back to free lists for local arenas --- src/gc.jl | 51 ++++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 9da1ae5b..e56bbbc8 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -303,6 +303,9 @@ const GCFrame = Ptr{ObjectRef} in_perma_safepoint = 2 end +const LocalArena = FreeListArena +const GlobalArena = FreeListArena + # A data structure that contains global GC info. This data # structure is designed to be immutable: it should not be changed # once the host has set it up. @@ -325,10 +328,10 @@ struct GCMasterRecord tiny_arena::Ptr{ScatterAllocArena} # A pointer to a list of local GC arena pointers. - local_arenas::Ptr{Ptr{BodegaArena}} + local_arenas::Ptr{Ptr{LocalArena}} # A pointer to the global GC arena. - global_arena::Ptr{FreeListArena} + global_arena::Ptr{GlobalArena} # A pointer to a list of safepoint flags. 
Every warp has its # own flag. @@ -377,10 +380,10 @@ end # Gets a pointer to the local arena for this thread. This # pointer may be null if there are no local arenas. -@inline function get_local_arena()::Ptr{BodegaArena} +@inline function get_local_arena()::Ptr{LocalArena} master_record = get_gc_master_record() if master_record.local_arena_count == UInt32(0) - return Base.unsafe_convert(Ptr{BodegaArena}, C_NULL) + return Base.unsafe_convert(Ptr{LocalArena}, C_NULL) else return unsafe_load( master_record.local_arenas, @@ -1183,8 +1186,8 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Allocate a local arena pointer buffer. - local_arenas_bytesize = sizeof(Ptr{BodegaArena}) * local_arena_count - local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{BodegaArena}}, gc_memory_start_ptr) + local_arenas_bytesize = sizeof(Ptr{LocalArena}) * local_arena_count + local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{LocalArena}}, gc_memory_start_ptr) # Allocate the safepoint flag buffer. safepoint_bytesize = sizeof(SafepointState) * warp_count @@ -1214,13 +1217,13 @@ function gc_init!( # Set up local arenas. for i in 1:local_arena_count - local_arena = make_gc_arena!(BodegaArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) + local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) unsafe_store!(local_arenas_ptr, local_arena, i) arena_start_ptr += local_arena_starvation_threshold end # Set up the global arena. 
- global_arena = make_gc_arena!(FreeListArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) + global_arena = make_gc_arena!(GlobalArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr)) return GCMasterRecord( warp_count, @@ -1390,42 +1393,36 @@ function get_record( alloc_list::SortedAllocationList, pointer::Ptr{T})::Ptr{FreeListRecord} where T - cast_ptr = Base.unsafe_convert(Ptr{FreeListRecord}, pointer) - - # Deal with the most common cases quickly. + # Deal with these cases quickly so we can assume that the + # free list is nonempty. if length(alloc_list) == 0 || pointer < data_pointer(alloc_list.records[1]) || - pointer > data_pointer(alloc_list.records[end]) + Base.unsafe_load(alloc_list.records[end]).size + pointer >= data_end_pointer(alloc_list.records[end]) return C_NULL end - # To do this lookup quickly, we will do a binary search for the - # biggest allocation record pointer that is smaller than `pointer`. + # To quickly narrow down the search space, we will do a binary search + # for the biggest allocation record pointer that is smaller than `pointer`. range_start, range_end = 1, length(alloc_list) - while range_end - range_start > 1 + while range_end - range_start > 4 range_mid = div(range_start + range_end, 2) mid_val = alloc_list.records[range_mid] - if mid_val > cast_ptr + if mid_val > pointer range_end = range_mid else range_start = range_mid end end - record = alloc_list.records[range_end] - if record >= cast_ptr - record = alloc_list.records[range_start] - end - # Make sure that the pointer actually points to a region of memory # that is managed by the candidate record we found. 
- record_data_pointer = data_pointer(record) - if cast_ptr >= record_data_pointer && cast_ptr < record_data_pointer + unsafe_load(record).size - return record - else - return C_NULL + for record in alloc_list.records[range_start:range_end] + if pointer >= data_pointer(record) && pointer < data_end_pointer(record) + return record + end end + return C_NULL end # Iterates through a linked list of allocation records and apply a function @@ -1561,7 +1558,7 @@ function gc_compact(arena::Ptr{FreeListArena})::Csize_t unsafe_store!(prev_record_ptr, C_NULL) # Compute the total number of free bytes. - return sum(record -> unsafe_load(record).size, records) + return sum(map(record -> unsafe_load(record).size, records)) end # Compact a GC arena's free list. This function will From b9029da15395aad3a517f3b661740cecbe176e80 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 16:20:16 +0200 Subject: [PATCH 077/146] Fix GC collection bug --- src/gc.jl | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index e56bbbc8..0993fd5b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1085,8 +1085,7 @@ function gc_free_local( free_list_head_ptr = @get_field_pointer(arena, :free_list_head) # Remove the record from the allocation list. - next_record = unsafe_load(next_record_ptr) - unsafe_store!(record_ptr, next_record) + unsafe_store!(record_ptr, unsafe_load(next_record_ptr)) # Add the record to the free list and update its `next` pointer # (but not in that order). @@ -1137,7 +1136,7 @@ const global_arena_starvation_threshold = 4 * MiB # If a local arena's free byte count stays below the arena starvation # threshold after a collection phase, the collector will allocate # additional memory to the arena such that it is no longer starving. -# The arena starvation threshold is currently set to 2 MiB. +# The arena starvation threshold is currently set to 1 MiB. 
const local_arena_starvation_threshold = 1 * MiB # The point at which a tiny arena is deemed to be starving, i.e., @@ -1704,10 +1703,9 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, # Mark the block as live. push!(live_blocks, record) # Add all pointer-sized, aligned values to the live pointer worklist. - block_pointer = data_pointer(record) - block_size = unsafe_load(record).size - for i in 0:sizeof(ObjectRef):(block_size - 1) - push!(live_worklist, Base.unsafe_convert(ObjectRef, block_pointer + i)) + for ptr in data_pointer(record):sizeof(ObjectRef):data_end_pointer(record) - 1 + value = unsafe_load(Base.unsafe_convert(Ptr{ObjectRef}, ptr)) + push!(live_worklist, value) end end end From 7e20c667bfb3dd87b3707f111f0520aef98f131b Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 16:27:49 +0200 Subject: [PATCH 078/146] Reduce initial GC heap size --- src/gc.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 0993fd5b..82f5c0e2 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1115,8 +1115,8 @@ end # One megabyte. const MiB = 1 << 20 -# The initial size of the GC heap, currently 16 MiB. -const initial_gc_heap_size = 16 * MiB +# The initial size of the GC heap, currently 10 MiB. +const initial_gc_heap_size = 10 * MiB # The default capacity of a root buffer, i.e., the max number of # roots that can be stored per thread. Currently set to From 51ca870b1f2a1ac18e00417d41552e4a296719eb Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 16:31:54 +0200 Subject: [PATCH 079/146] Update benchmarking utilities --- examples/utils.jl | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/examples/utils.jl b/examples/utils.jl index 253b8622..f8720a82 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -7,19 +7,17 @@ Reset the CUDA state associated with a device. 
This call with release the underl context, at which point any objects allocated in that context will be invalidated. """ function device_reset!(dev::CuDevice=CUDAdrv.device()) - if haskey(CUDAnative.device_contexts, dev) - # take the context out of the pool, and finalize it to trigger release - old_ctx = CUDAnative.device_contexts[dev] - delete!(CUDAnative.device_contexts, dev) - finalize(old_ctx) - - # unless the user switches devices, new API calls should trigger initialization - CUDAdrv.apicall_hook[] = CUDAnative.maybe_initialize - CUDAnative.initialized[] = false - # HACK: the context changes, but CuCurrentContext() _can_ actually return a handle - # with the same pointer value... this bypasses the compile cache, and crashes - empty!(CUDAnative.compilecache) - end + delete!(CUDAnative.device_contexts, dev) + + pctx = CuPrimaryContext(dev) + unsafe_reset!(pctx) + + # unless the user switches devices, new API calls should trigger initialization + CUDAdrv.apicall_hook[] = CUDAnative.maybe_initialize + CUDAnative.initialized[] = false + + # HACK: primary contexts always have the same handle, defeating the compilation cache + empty!(CUDAnative.compilecache) end """ @@ -55,6 +53,6 @@ end macro cuda_benchmark(ex) esc(quote - @benchmark $(ex) teardown=(device_reset!()) evals=1 + @benchmark $(ex) setup=($(ex)) teardown=(device_reset!()) evals=1 end) end From 4390d90556bad854820471c73522f9ad8048c4b1 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 3 Apr 2019 16:46:20 +0200 Subject: [PATCH 080/146] Put GC benchmarks in a separate directory --- gc-benchmarks/binary-tree.jl | 165 ++++++++++++++++++ gc-benchmarks/linked-list.jl | 81 +++++++++ .../matrix-static-arrays.jl | 14 +- {examples => gc-benchmarks}/utils.jl | 6 +- 4 files changed, 259 insertions(+), 7 deletions(-) create mode 100644 gc-benchmarks/binary-tree.jl create mode 100644 gc-benchmarks/linked-list.jl rename {examples => gc-benchmarks}/matrix-static-arrays.jl (74%) rename {examples => 
gc-benchmarks}/utils.jl (90%) diff --git a/gc-benchmarks/binary-tree.jl b/gc-benchmarks/binary-tree.jl new file mode 100644 index 00000000..b5a76629 --- /dev/null +++ b/gc-benchmarks/binary-tree.jl @@ -0,0 +1,165 @@ +using CUDAdrv, CUDAnative +using Random, Test +import Base: haskey, insert! + +include("utils.jl") + +# This benchmark defines a kernel that constructs a binary search +# tree for a set of numbers and then proceeds to test membership +# in that tree for a sequence of other numbers. +# +# The benchmark is designed to stress the allocator's ability to +# allocate many small objects and garbage-collect the ones that +# become dead after a while. + +"""A binary search tree node.""" +abstract type BinarySearchTreeNode{T} end + +"""An internal node of a binary search tree.""" +mutable struct InternalNode{T} <: BinarySearchTreeNode{T} + value::T + left::BinarySearchTreeNode{T} + right::BinarySearchTreeNode{T} +end + +InternalNode{T}(value::T) where T = InternalNode{T}(value, LeafNode{T}(), LeafNode{T}()) + +"""A leaf node of a binary search tree.""" +mutable struct LeafNode{T} <: BinarySearchTreeNode{T} end + +"""A binary search tree data structure.""" +mutable struct BinarySearchTree{T} + root::BinarySearchTreeNode{T} +end + +"""Creates an empty binary search tree.""" +BinarySearchTree{T}() where T = BinarySearchTree{T}(LeafNode{T}()) + +"""Tells if a binary search tree contains a particular element.""" +function haskey(tree::BinarySearchTree{T}, value::T)::Bool where T + walk = tree.root + while isa(walk, InternalNode{T}) + if walk.value == value + return true + elseif walk.value > value + walk = walk.right + else + walk = walk.left + end + end + return false +end + +"""Inserts an element into a binary search tree.""" +function insert!(tree::BinarySearchTree{T}, value::T) where T + if !isa(tree.root, InternalNode{T}) + tree.root = InternalNode{T}(value) + return + end + + walk = tree.root::InternalNode{T} + while true + if walk.value == value + return + 
elseif walk.value > value + right = walk.right + if isa(right, InternalNode{T}) + walk = right + else + walk.right = InternalNode{T}(value) + return + end + else + left = walk.left + if isa(left, InternalNode{T}) + walk = left + else + walk.left = InternalNode{T}(value) + return + end + end + end +end + +""" +Creates a binary search tree that contains elements copied from a device array. +""" +function BinarySearchTree{T}(elements::CUDAnative.DevicePtr{T}, size::Integer) where T + tree = BinarySearchTree{T}() + for i in 1:size + insert!(tree, unsafe_load(elements, i)) + end + tree +end + +""" +Creates a binary search tree that contains elements copied from an array. +""" +function BinarySearchTree{T}(elements::Array{T}) where T + tree = BinarySearchTree{T}() + for i in 1:length(elements) + insert!(tree, elements[i]) + end + tree +end + +# Gets a sequence of Fibonacci numbers. +function fibonacci(::Type{T}, count::Integer)::Array{T} where T + if count == 0 + return [] + elseif count == 1 + return [one(T)] + end + + results = [one(T), one(T)] + for i in 1:(count - 2) + push!(results, results[length(results) - 1] + results[length(results)]) + end + return results +end + +const number_count = 200 +const thread_count = 64 +const tests_per_thread = 2000 + +# Define a kernel that copies values using a temporary buffer. +function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + tree = BinarySearchTree{Int64}(a, number_count) + + for j in 1:tests_per_thread + offset = (i - 1) * tests_per_thread + index = offset + j + unsafe_store!(b, haskey(tree, unsafe_load(b, index)), index) + end + + return +end + +function benchmark() + # Generate a sequence of 64-bit truncated Fibonacci numbers. + number_set = fibonacci(Int64, number_count) + # Randomize the sequence's order. + shuffle!(number_set) + + # Generate numbers for which we will test membership in the sequence. 
+ test_sequence = Array(1:(thread_count * tests_per_thread)) + + # Allocate two arrays. + source_array = Mem.alloc(Int64, length(number_set)) + destination_array = Mem.alloc(Int64, length(test_sequence)) + source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + + # Fill the source and destination arrays. + Mem.upload!(source_array, number_set) + Mem.upload!(destination_array, test_sequence) + + # Run the kernel. + @cuda_sync threads=thread_count kernel(source_pointer, destination_pointer) + + @test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) +end + +@cuda_benchmark benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl new file mode 100644 index 00000000..a8e0c616 --- /dev/null +++ b/gc-benchmarks/linked-list.jl @@ -0,0 +1,81 @@ +using CUDAnative, CUDAdrv, BenchmarkTools +using Test + +include("utils.jl") + +import Base: foldl, reduce, sum + +# This benchmark constructs a linked list in a GPU kernel. +# In doing so, it stresses the allocator's ability to quickly +# allocate many small objects, as is common in idiomatic +# object-oriented programs. +# Thread divergence should be minimal in this benchmark. 
+ +abstract type List{T} +end + +mutable struct Nil{T} <: List{T} +end + +mutable struct Cons{T} <: List{T} + value::T + next::List{T} +end + +Cons{T}(value::T) where T = Cons{T}(value, Nil{T}()) + +function List{T}(pointer, count::Integer) where T + result = Nil{T}() + for i in count:-1:1 + result = Cons{T}(unsafe_load(pointer, i), result) + end + result +end + +function foldl(op, list::List{T}; init) where T + node = list + accumulator = init + while isa(node, Cons{T}) + accumulator = op(accumulator, node.value) + node = node.next + end + accumulator +end + +function reduce(op, list::List{T}; init) where T + foldl(op, list; init=init) +end + +function sum(list::List{T}) where T + reduce(+, list; init=zero(T)) +end + +const element_count = 1000 +const thread_count = 32 + +function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + l = List{Int64}(elements, element_count) + unsafe_store!(results, sum(l), i) + return +end + +function benchmark() + # Allocate two arrays. + source_array = Mem.alloc(Int64, element_count) + destination_array = Mem.alloc(Int64, thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + + # Fill the source and destination arrays. + Mem.upload!(source_array, Array(1:element_count)) + Mem.upload!(destination_array, zeros(Int64, thread_count)) + + # Run the kernel. + @cuda_sync threads=thread_count kernel(source_pointer, destination_pointer) + + # Verify the kernel's output. 
+ @test Mem.download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count) +end + +@cuda_benchmark benchmark() diff --git a/examples/matrix-static-arrays.jl b/gc-benchmarks/matrix-static-arrays.jl similarity index 74% rename from examples/matrix-static-arrays.jl rename to gc-benchmarks/matrix-static-arrays.jl index 5e174bf5..43002099 100644 --- a/examples/matrix-static-arrays.jl +++ b/gc-benchmarks/matrix-static-arrays.jl @@ -1,4 +1,10 @@ -using StaticArrays, CUDAnative, CUDAdrv, BenchmarkTools +using StaticArrays, CUDAnative, CUDAdrv + +include("utils.jl") + +# This benchmark makes every thread allocate a large matrix. +# It stresses the allocator's ability to quickly allocate +# a small number of very large objects. const matrix_dim = 40 const thread_count = 256 @@ -21,14 +27,10 @@ function kernel(result::CUDAnative.DevicePtr{Int64}) return end -include("utils.jl") - function benchmark() destination_array = Mem.alloc(Int64, thread_count) destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) @cuda_sync threads=thread_count kernel(destination_pointer) end -stats = @cuda_benchmark benchmark() -println(length(stats)) -println(stats) +@cuda_benchmark benchmark() diff --git a/examples/utils.jl b/gc-benchmarks/utils.jl similarity index 90% rename from examples/utils.jl rename to gc-benchmarks/utils.jl index f8720a82..dfb289a0 100644 --- a/examples/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,3 +1,5 @@ +import BenchmarkTools + use_gc = true """ @@ -53,6 +55,8 @@ end macro cuda_benchmark(ex) esc(quote - @benchmark $(ex) setup=($(ex)) teardown=(device_reset!()) evals=1 + local stats = BenchmarkTools.@benchmark $(ex) setup=($(ex)) teardown=(device_reset!()) evals=1 + println(length(stats)) + println(stats) end) end From a3fb2903976f5f237d47fc1d87712b759163833c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 13:26:45 +0200 Subject: [PATCH 081/146] Rename linked list benchmark import --- 
gc-benchmarks/linked-list.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl index a8e0c616..ae8e3fcc 100644 --- a/gc-benchmarks/linked-list.jl +++ b/gc-benchmarks/linked-list.jl @@ -1,4 +1,4 @@ -using CUDAnative, CUDAdrv, BenchmarkTools +using CUDAnative, CUDAdrv using Test include("utils.jl") From 79ea2a1df301d218c44255cf00473c3c2814805d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 13:27:25 +0200 Subject: [PATCH 082/146] Rename matrix GC benchmark --- gc-benchmarks/{matrix-static-arrays.jl => matrix.jl} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename gc-benchmarks/{matrix-static-arrays.jl => matrix.jl} (100%) diff --git a/gc-benchmarks/matrix-static-arrays.jl b/gc-benchmarks/matrix.jl similarity index 100% rename from gc-benchmarks/matrix-static-arrays.jl rename to gc-benchmarks/matrix.jl From f2dbf3f3ed548749f4db45a86c8f047b4f5a4f5a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 14:22:42 +0200 Subject: [PATCH 083/146] Set the malloc heap size when running benchmarks --- gc-benchmarks/utils.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index dfb289a0..868ccbb7 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -2,6 +2,18 @@ import BenchmarkTools use_gc = true +const MiB = 1 << 20 +const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 +const BENCHMARK_HEAP_SIZE = 64 * MiB + +function set_malloc_heap_size(size::Integer) + CUDAdrv.@apicall( + :cuCtxSetLimit, + (Cint, Csize_t), + CU_LIMIT_MALLOC_HEAP_SIZE, + Csize_t(size)) +end + """ device_reset!(dev::CuDevice=device()) @@ -55,7 +67,7 @@ end macro cuda_benchmark(ex) esc(quote - local stats = BenchmarkTools.@benchmark $(ex) setup=($(ex)) teardown=(device_reset!()) evals=1 + local stats = BenchmarkTools.@benchmark $(ex) setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $(ex)) teardown=(device_reset!()) 
evals=1 println(length(stats)) println(stats) end) From b57b9022a1cf1dd2fbc96fd9bd2dffd591587ef9 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 14:22:54 +0200 Subject: [PATCH 084/146] Add an array benchmark --- gc-benchmarks/arrays.jl | 51 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 gc-benchmarks/arrays.jl diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl new file mode 100644 index 00000000..160824ce --- /dev/null +++ b/gc-benchmarks/arrays.jl @@ -0,0 +1,51 @@ +using CUDAdrv, CUDAnative, StaticArrays + +include("utils.jl") + +# This benchmark allocates a variety of differently-sized arrays. +# The point of this benchmark is to ascertain how well the GC handles +# many differently-sized objects. + +const thread_count = 64 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +macro new_array(T, size) + quote + escape(zeros(MArray{Tuple{$size}, $T})) + end +end + +function kernel() + for i in 1:2 + for j in 1:2 + for k in 1:2 + for l in 1:2 + @new_array(Int64, 4) + @new_array(Int64, 8) + @new_array(Int64, 16) + end + @new_array(Int64, 32) + @new_array(Int64, 64) + @new_array(Int64, 128) + end + @new_array(Int64, 256) + @new_array(Int64, 512) + @new_array(Int64, 1024) + end + @new_array(Int64, 2048) + @new_array(Int64, 4096) + @new_array(Int64, 8192) + end + return +end + +function benchmark() + # Run the kernel. + @cuda_sync threads=thread_count kernel() +end + +@cuda_benchmark benchmark() From 89d8bbc96b74bc596655879524c4550ea51abe71 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 4 Apr 2019 14:28:23 +0200 Subject: [PATCH 085/146] Reuse 'device_reset!' 
in benchmarking utils --- gc-benchmarks/utils.jl | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 868ccbb7..5982e3c4 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -14,26 +14,6 @@ function set_malloc_heap_size(size::Integer) Csize_t(size)) end -""" - device_reset!(dev::CuDevice=device()) - -Reset the CUDA state associated with a device. This call with release the underlying -context, at which point any objects allocated in that context will be invalidated. -""" -function device_reset!(dev::CuDevice=CUDAdrv.device()) - delete!(CUDAnative.device_contexts, dev) - - pctx = CuPrimaryContext(dev) - unsafe_reset!(pctx) - - # unless the user switches devices, new API calls should trigger initialization - CUDAdrv.apicall_hook[] = CUDAnative.maybe_initialize - CUDAnative.initialized[] = false - - # HACK: primary contexts always have the same handle, defeating the compilation cache - empty!(CUDAnative.compilecache) -end - """ @sync ex Run expression `ex` and synchronize the GPU afterwards. 
This is a CPU-friendly From fac07ef8b4b2ecea753e8a474af4daa1a81db679 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 9 Apr 2019 15:41:06 +0200 Subject: [PATCH 086/146] Create a GC benchmark driver --- gc-benchmarks/arrays.jl | 12 +++++----- gc-benchmarks/binary-tree.jl | 19 +++++++++------- gc-benchmarks/linked-list.jl | 23 ++++++++++--------- gc-benchmarks/matrix.jl | 27 ++++++++++++++-------- gc-benchmarks/run-all.jl | 10 +++++++++ gc-benchmarks/utils.jl | 43 ++++++++++++++++++++++++++---------- 6 files changed, 89 insertions(+), 45 deletions(-) create mode 100644 gc-benchmarks/run-all.jl diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl index 160824ce..49326cb6 100644 --- a/gc-benchmarks/arrays.jl +++ b/gc-benchmarks/arrays.jl @@ -1,6 +1,6 @@ -using CUDAdrv, CUDAnative, StaticArrays +module Arrays -include("utils.jl") +using CUDAdrv, CUDAnative, StaticArrays # This benchmark allocates a variety of differently-sized arrays. # The point of this benchmark is to ascertain how well the GC handles @@ -43,9 +43,11 @@ function kernel() return end -function benchmark() +end + +function arrays_benchmark() # Run the kernel. - @cuda_sync threads=thread_count kernel() + @cuda_sync threads=Arrays.thread_count Arrays.kernel() end -@cuda_benchmark benchmark() +@cuda_benchmark "arrays" arrays_benchmark() diff --git a/gc-benchmarks/binary-tree.jl b/gc-benchmarks/binary-tree.jl index b5a76629..99d98afc 100644 --- a/gc-benchmarks/binary-tree.jl +++ b/gc-benchmarks/binary-tree.jl @@ -1,8 +1,9 @@ -using CUDAdrv, CUDAnative using Random, Test -import Base: haskey, insert! -include("utils.jl") +module BinaryTree + +using CUDAdrv, CUDAnative +import Base: haskey, insert! 
# This benchmark defines a kernel that constructs a binary search # tree for a set of numbers and then proceeds to test membership @@ -137,14 +138,16 @@ function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) return end -function benchmark() +end + +function bintree_benchmark() # Generate a sequence of 64-bit truncated Fibonacci numbers. - number_set = fibonacci(Int64, number_count) + number_set = BinaryTree.fibonacci(Int64, BinaryTree.number_count) # Randomize the sequence's order. shuffle!(number_set) # Generate numbers for which we will test membership in the sequence. - test_sequence = Array(1:(thread_count * tests_per_thread)) + test_sequence = Array(1:(BinaryTree.thread_count * BinaryTree.tests_per_thread)) # Allocate two arrays. source_array = Mem.alloc(Int64, length(number_set)) @@ -157,9 +160,9 @@ function benchmark() Mem.upload!(destination_array, test_sequence) # Run the kernel. - @cuda_sync threads=thread_count kernel(source_pointer, destination_pointer) + @cuda_sync threads=BinaryTree.thread_count BinaryTree.kernel(source_pointer, destination_pointer) @test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) end -@cuda_benchmark benchmark() +@cuda_benchmark "binary-tree" bintree_benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl index ae8e3fcc..84f76fc5 100644 --- a/gc-benchmarks/linked-list.jl +++ b/gc-benchmarks/linked-list.jl @@ -1,8 +1,7 @@ +module LinkedList + using CUDAnative, CUDAdrv using Test - -include("utils.jl") - import Base: foldl, reduce, sum # This benchmark constructs a linked list in a GPU kernel. @@ -60,22 +59,24 @@ function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.Devic return end -function benchmark() +end + +function linkedlist_benchmark() # Allocate two arrays. 
- source_array = Mem.alloc(Int64, element_count) - destination_array = Mem.alloc(Int64, thread_count) + source_array = Mem.alloc(Int64, LinkedList.element_count) + destination_array = Mem.alloc(Int64, LinkedList.thread_count) source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) # Fill the source and destination arrays. - Mem.upload!(source_array, Array(1:element_count)) - Mem.upload!(destination_array, zeros(Int64, thread_count)) + Mem.upload!(source_array, Array(1:LinkedList.element_count)) + Mem.upload!(destination_array, zeros(Int64, LinkedList.thread_count)) # Run the kernel. - @cuda_sync threads=thread_count kernel(source_pointer, destination_pointer) + @cuda_sync threads=LinkedList.thread_count LinkedList.kernel(source_pointer, destination_pointer) # Verify the kernel's output. - @test Mem.download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count) + @test Mem.download(Int64, destination_array, LinkedList.thread_count) == repeat([sum(1:LinkedList.element_count)], LinkedList.thread_count) end -@cuda_benchmark benchmark() +@cuda_benchmark "linked-list" linkedlist_benchmark() diff --git a/gc-benchmarks/matrix.jl b/gc-benchmarks/matrix.jl index 43002099..fa772e8e 100644 --- a/gc-benchmarks/matrix.jl +++ b/gc-benchmarks/matrix.jl @@ -1,14 +1,19 @@ -using StaticArrays, CUDAnative, CUDAdrv +module Matrix -include("utils.jl") +using StaticArrays, CUDAnative, CUDAdrv # This benchmark makes every thread allocate a large matrix. # It stresses the allocator's ability to quickly allocate -# a small number of very large objects. +# very large objects. 
const matrix_dim = 40 const thread_count = 256 +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + function fill() m = zeros(MMatrix{matrix_dim, matrix_dim, Int64}) @@ -18,19 +23,23 @@ function fill() end end - return m + return escape(m) end function kernel(result::CUDAnative.DevicePtr{Int64}) thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x - unsafe_store!(result, fill()[20, 30], thread_id) + for i in 1:6 + unsafe_store!(result, fill()[20, 30], thread_id) + end return end -function benchmark() - destination_array = Mem.alloc(Int64, thread_count) +end + +function matrix_benchmark() + destination_array = Mem.alloc(Int64, Matrix.thread_count) destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) - @cuda_sync threads=thread_count kernel(destination_pointer) + @cuda_sync threads=Matrix.thread_count Matrix.kernel(destination_pointer) end -@cuda_benchmark benchmark() +@cuda_benchmark "matrix" matrix_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl new file mode 100644 index 00000000..13050498 --- /dev/null +++ b/gc-benchmarks/run-all.jl @@ -0,0 +1,10 @@ +using CUDAdrv, CUDAnative + +include("utils.jl") + +include("arrays.jl") +include("binary-tree.jl") +include("linked-list.jl") +include("matrix.jl") + +println(run_benchmarks()) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 5982e3c4..e8d21900 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,6 +1,12 @@ import BenchmarkTools -use_gc = true +function should_use_gc() + try + return use_gc + catch ex + return true + end +end const MiB = 1 << 20 const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 @@ -34,21 +40,34 @@ macro sync(ex) end macro cuda_sync(args...) - if use_gc - esc(quote + esc(quote + if should_use_gc() CUDAnative.@cuda_gc $(args...) - end) - else - esc(quote + else @sync CUDAnative.@cuda $(args...) 
- end) - end + end + end) end -macro cuda_benchmark(ex) +suite = BenchmarkTools.BenchmarkGroup() + +function register_cuda_benchmark(f, name) + suite[name] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 +end + +macro cuda_benchmark(name, ex) esc(quote - local stats = BenchmarkTools.@benchmark $(ex) setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $(ex)) teardown=(device_reset!()) evals=1 - println(length(stats)) - println(stats) + register_cuda_benchmark($name * "-gc") do + global use_gc = true + $(ex) + end + register_cuda_benchmark($name * "-nogc") do + global use_gc = false + $(ex) + end end) end + +function run_benchmarks() + BenchmarkTools.run(suite) +end From f8d4edeb28ab537ec78f60a4dd3cb27423b451d5 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 9 Apr 2019 16:23:52 +0200 Subject: [PATCH 087/146] Include an SSA IR optimization benchmark --- gc-benchmarks/run-all.jl | 1 + gc-benchmarks/ssa-opt.jl | 100 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 gc-benchmarks/ssa-opt.jl diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 13050498..256fbe38 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -6,5 +6,6 @@ include("arrays.jl") include("binary-tree.jl") include("linked-list.jl") include("matrix.jl") +include("ssa-opt.jl") println(run_benchmarks()) diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl new file mode 100644 index 00000000..3f8c3a39 --- /dev/null +++ b/gc-benchmarks/ssa-opt.jl @@ -0,0 +1,100 @@ +# This benchmark defines a sea-of-nodes SSA IR, creates a basic +# block on the GPU and applies the constant folding optimization +# to it. + +module SSAOpt + +# A base type for SSA instructions. +abstract type Instruction end + +# A base type for values or flow in an SSA basic block. +abstract type ValueOrFlow end + +# A value in an SSA control-flow graph. 
+mutable struct Value <: ValueOrFlow + # The instruction that computes the value. + instruction::Instruction + + # The next value or control-flow instruction. + next::ValueOrFlow +end + +# A base type for control-flow instructions in an SSA basic block. +abstract type Flow <: ValueOrFlow end + +# A control-flow instruction that returns a value. +mutable struct ReturnFlow <: Flow + # The value to return. + result::Value +end + +# A control-flow instruction that represents undefined control flow. +mutable struct UndefinedFlow <: Flow end + +# A basic block in an SSA control-flow graph. +mutable struct BasicBlock + # The first value or flow instruction in the basic block. + head::ValueOrFlow +end + +# An integer constant instruction. +mutable struct IConst <: Instruction + value::Int +end + +# An integer addition instruction. +mutable struct IAdd <: Instruction + # The left value. + left::Value + # The right value. + right::Value +end + +# Folds constants in a basic block. +function fold_constants(block::BasicBlock) + value = block.head + while isa(value, Value) + insn = value.instruction + if isa(insn, IAdd) + left = insn.left.instruction + right = insn.right.instruction + if isa(left, IConst) + if isa(right, IConst) + value.instruction = IConst(left.value + right.value) + end + end + end + value = value.next + end + block +end + +# Creates a block that naively computes `sum(1:range_max)`. 
+function create_range_sum_block(range_max) + head = accumulator = Value(IConst(0), UndefinedFlow()) + for i in 1:range_max + constant = Value(IConst(i), UndefinedFlow()) + accumulator.next = constant + accumulator = Value(IAdd(accumulator, constant), UndefinedFlow()) + constant.next = accumulator + end + ret_flow = ReturnFlow(accumulator) + accumulator.next = ret_flow + BasicBlock(head) +end + +const thread_count = 256 + +function kernel() + block = create_range_sum_block(50) + fold_constants(block) + return +end + +end + +function ssaopt_benchmark() + @cuda_sync threads=SSAOpt.thread_count SSAOpt.kernel() +end + +@cuda_benchmark "ssa-opt" ssaopt_benchmark() From 67ac9de09a33e1f43984a01cfa55d87ea6c570a2 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 9 Apr 2019 16:34:37 +0200 Subject: [PATCH 088/146] Tweak ssa-opt benchmark comment --- gc-benchmarks/ssa-opt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl index 3f8c3a39..e499e543 100644 --- a/gc-benchmarks/ssa-opt.jl +++ b/gc-benchmarks/ssa-opt.jl @@ -1,4 +1,4 @@ -# This benchmark defines a sea-of-nodes SSA IR, creates a basic +# This benchmark defines a simple SSA IR, creates a basic # block on the GPU and applies the constant folding optimization # to it. 
From 727a9281dc1113d5c059c032e1cca87735b71d8d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 9 Apr 2019 17:15:27 +0200 Subject: [PATCH 089/146] Write benchmark results to a CSV --- gc-benchmarks/binary-tree.jl | 2 +- gc-benchmarks/linked-list.jl | 2 +- gc-benchmarks/run-all.jl | 17 ++++++++++++++++- gc-benchmarks/ssa-opt.jl | 2 +- gc-benchmarks/utils.jl | 9 +++++---- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/gc-benchmarks/binary-tree.jl b/gc-benchmarks/binary-tree.jl index 99d98afc..e7b3e46d 100644 --- a/gc-benchmarks/binary-tree.jl +++ b/gc-benchmarks/binary-tree.jl @@ -165,4 +165,4 @@ function bintree_benchmark() @test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) end -@cuda_benchmark "binary-tree" bintree_benchmark() +@cuda_benchmark "binary tree" bintree_benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl index 84f76fc5..762bf6de 100644 --- a/gc-benchmarks/linked-list.jl +++ b/gc-benchmarks/linked-list.jl @@ -79,4 +79,4 @@ function linkedlist_benchmark() @test Mem.download(Int64, destination_array, LinkedList.thread_count) == repeat([sum(1:LinkedList.element_count)], LinkedList.thread_count) end -@cuda_benchmark "linked-list" linkedlist_benchmark() +@cuda_benchmark "linked list" linkedlist_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 256fbe38..ca4ab50b 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -8,4 +8,19 @@ include("linked-list.jl") include("matrix.jl") include("ssa-opt.jl") -println(run_benchmarks()) +results = run_benchmarks() +# Print the results to the terminal. +println(results) + +# Also write them to a CSV for further analysis. 
+open("results.csv", "w") do file + write(file, "benchmark,nogc,gc,ratio\n") + for key in sort([k for k in keys(results)]) + runs = results[key] + median_times = BenchmarkTools.median(runs) + gc_time = median_times["gc"].time / 1e6 + nogc_time = median_times["nogc"].time / 1e6 + ratio = gc_time / nogc_time + write(file, "$key,$nogc_time,$gc_time,$ratio\n") + end +end diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl index e499e543..b7c10238 100644 --- a/gc-benchmarks/ssa-opt.jl +++ b/gc-benchmarks/ssa-opt.jl @@ -97,4 +97,4 @@ function ssaopt_benchmark() @cuda_sync threads=SSAOpt.thread_count SSAOpt.kernel() end -@cuda_benchmark "ssa-opt" ssaopt_benchmark() +@cuda_benchmark "ssa opt" ssaopt_benchmark() diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index e8d21900..5623bb02 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -51,17 +51,18 @@ end suite = BenchmarkTools.BenchmarkGroup() -function register_cuda_benchmark(f, name) - suite[name] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 +function register_cuda_benchmark(f, name, config) + suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 end macro cuda_benchmark(name, ex) esc(quote - register_cuda_benchmark($name * "-gc") do + suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "nogc"]) + register_cuda_benchmark($name, "gc") do global use_gc = true $(ex) end - register_cuda_benchmark($name * "-nogc") do + register_cuda_benchmark($name, "nogc") do global use_gc = false $(ex) end From fabdea9e7009c1c5aa9cfcc28d8a6da0513900e8 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 10 Apr 2019 15:48:46 +0200 Subject: [PATCH 090/146] Add two additional GC benchmarks --- gc-benchmarks/genetic-algorithm.jl | 179 +++++++++++++++++++++++++++++ gc-benchmarks/linked-list.jl | 41 ++++++- 
gc-benchmarks/run-all.jl | 4 +- gc-benchmarks/stream-queries.jl | 31 +++++ 4 files changed, 252 insertions(+), 3 deletions(-) create mode 100644 gc-benchmarks/genetic-algorithm.jl create mode 100644 gc-benchmarks/stream-queries.jl diff --git a/gc-benchmarks/genetic-algorithm.jl b/gc-benchmarks/genetic-algorithm.jl new file mode 100644 index 00000000..69a7fcac --- /dev/null +++ b/gc-benchmarks/genetic-algorithm.jl @@ -0,0 +1,179 @@ +module GeneticAlgorithm + +# This benchmark runs a genetic algorithm on the GPU. +# The population is stored in linked lists and characters +# are stored in heap memory. + +using CUDAnative, CUDAdrv +import ..LinkedList: List, Nil, Cons, foldl, map, max + +# A character in our genetic algorithm, based loosely on Fallout's SPECIAL system. +mutable struct Character + strength::Int + perception::Int + endurance::Int + charisma::Int + intelligence::Int + agility::Int + luck::Int +end + +# A linear congruential pseudo-random number generator. +mutable struct LinearCongruentialGenerator + modulus::Int + a::Int + c::Int + state::Int +end + +LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) + +# Requests a pseudo-random number. +function next(generator::LinearCongruentialGenerator)::Int + generator.state = (generator.a * generator.state + generator.c) % generator.modulus + generator.state +end + +# Requests a pseudo-random number that is at least as great as `lower` +# and less than `upper`. +function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int + lower + next(generator) % (upper - lower) +end + +# Computes the mean of two integers. 
+function mean(a::Int, b::Int)::Int + div(a + b, 2) +end + +function crossover(parent_one::Character, parent_two::Character)::Character + Character( + mean(parent_one.strength, parent_two.strength), + mean(parent_one.perception, parent_two.perception), + mean(parent_one.endurance, parent_two.endurance), + mean(parent_one.charisma, parent_two.charisma), + mean(parent_one.intelligence, parent_two.intelligence), + mean(parent_one.agility, parent_two.agility), + mean(parent_one.luck, parent_two.luck)) +end + +function mutate_stat(value::Int, generator::LinearCongruentialGenerator)::Int + new_stat = value + next(generator, -2, 3) + if new_stat > 10 + return 10 + elseif new_stat < 0 + return 0 + else + return new_stat + end +end + +function mutate(original::Character, generator::LinearCongruentialGenerator)::Character + Character( + mutate_stat(original.strength, generator), + mutate_stat(original.perception, generator), + mutate_stat(original.endurance, generator), + mutate_stat(original.charisma, generator), + mutate_stat(original.intelligence, generator), + mutate_stat(original.agility, generator), + mutate_stat(original.luck, generator)) +end + +function random_character(generator::LinearCongruentialGenerator)::Character + Character( + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11)) +end + +# Computes the fitness of a character. +function fitness(individual::Character)::Float64 + # Compute the character's cost, i.e., the sum of their stats. + cost = Float64(individual.strength + + individual.perception + + individual.endurance + + individual.charisma + + individual.intelligence + + individual.agility + + individual.luck) + + # Compute the character's true fitness, i.e., how well we expect + # the character to perform. 
+ true_fitness = 0.0 + + function stat_fitness(stat::Int)::Float64 + if stat >= 5 + # Linear returns for stats greater than five. + return Float64(stat) + else + # Very low stats make for a poor character build. + return Float64(stat * stat) / 25.0 + end + end + + # Evaluate stats. + true_fitness += stat_fitness(individual.strength) + true_fitness += stat_fitness(individual.perception) + true_fitness += stat_fitness(individual.endurance) + true_fitness += stat_fitness(individual.charisma) + true_fitness += stat_fitness(individual.intelligence) + true_fitness += stat_fitness(individual.agility) + true_fitness += stat_fitness(individual.luck) + + # We like charisma, intelligence and luck. + true_fitness += Float64(individual.charisma) + true_fitness += Float64(individual.intelligence) + true_fitness += Float64(individual.luck) + + true_fitness - cost + 100.0 +end + +function fittest(population::List{Character})::Character + max(fitness, population, Character(0, 0, 0, 0, 0, 0, 0)) +end + +function step(population::List{Character}, generator::LinearCongruentialGenerator)::List{Character} + # Find the fittest individual in the population. + best = fittest(population) + # Do a bunch of crossovers and mutate the resulting population. + map(x -> mutate(crossover(best, x), generator), population) +end + +function genetic_algo(seed::Int)::Character + generator = LinearCongruentialGenerator(seed) + + # Generate some random characters. + individuals = Nil{Character}() + for j in 1:10 + individuals = Cons{Character}(random_character(generator), individuals) + end + + # Run the genetic algorithm for a few iterations. + for j in 1:10 + individuals = step(individuals, generator) + end + + # Find the best individual in the population. 
+ fittest(individuals) +end + +const thread_count = 256 + +function kernel(results::CUDAnative.DevicePtr{Float64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + fittest_individual = genetic_algo(i) + unsafe_store!(results, fitness(fittest_individual), i) +end + +end + +function genetic_benchmark() + destination_array = Mem.alloc(Float64, GeneticAlgorithm.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + @cuda_sync threads=GeneticAlgorithm.thread_count GeneticAlgorithm.kernel(destination_pointer) +end + +@cuda_benchmark "genetic algo" genetic_benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl index 762bf6de..64810ead 100644 --- a/gc-benchmarks/linked-list.jl +++ b/gc-benchmarks/linked-list.jl @@ -1,8 +1,7 @@ module LinkedList using CUDAnative, CUDAdrv -using Test -import Base: foldl, reduce, sum +import Base: foldl, reduce, sum, max, map, reverse, filter # This benchmark constructs a linked list in a GPU kernel. 
# In doing so, it stresses the allocator's ability to quickly @@ -49,6 +48,44 @@ function sum(list::List{T}) where T reduce(+, list; init=zero(T)) end +function map_reverse(f::Function, list::List{T})::List{T} where T + foldl(list; init=Nil{T}()) do accumulator, value + Cons{T}(f(value), accumulator) + end +end + +function reverse(list::List{T})::List{T} where T + map_reverse(x -> x, list) +end + +function map(f::Function, list::List{T})::List{T} where T + reverse(map_reverse(f, list)) +end + +function max(evaluate::Function, list::List{T}, default_value::T)::T where T + foldl(list; init=default_value) do max_elem, elem + if evaluate(max_elem) < evaluate(elem) + elem + else + max_elem + end + end +end + +function filter_reverse(f::Function, list::List{T})::List{T} where T + foldl(list; init=Nil{T}()) do accumulator, value + if f(value) + Cons{T}(value, accumulator) + else + accumulator + end + end +end + +function filter(f::Function, list::List{T})::List{T} where T + reverse(filter_reverse(f, list)) +end + const element_count = 1000 const thread_count = 32 diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index ca4ab50b..71d8f4fc 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -1,4 +1,4 @@ -using CUDAdrv, CUDAnative +using CUDAdrv, CUDAnative, Test include("utils.jl") @@ -7,6 +7,8 @@ include("binary-tree.jl") include("linked-list.jl") include("matrix.jl") include("ssa-opt.jl") +include("stream-queries.jl") +include("genetic-algorithm.jl") results = run_benchmarks() # Print the results to the terminal. diff --git a/gc-benchmarks/stream-queries.jl b/gc-benchmarks/stream-queries.jl new file mode 100644 index 00000000..0c992d5f --- /dev/null +++ b/gc-benchmarks/stream-queries.jl @@ -0,0 +1,31 @@ +module StreamQueries + +using CUDAnative, CUDAdrv +import ..LinkedList: List, Nil, Cons, foldl, map, max, filter + +# This benchmark applies stream operators (map, max,filter) to purely +# functional lists. 
+ +const thread_count = 256 +const input_size = 100 + +function kernel(input::CUDAnative.DevicePtr{Float64}, output::CUDAnative.DevicePtr{Float64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + values = List{Float64}(input, input_size) + values = map(x -> x * x, values) + values = filter(x -> x < 10.0 && x >= 0.0, values) + unsafe_store!(output, max(x -> x, values, 0.0), i) +end + +end + +function stream_benchmark() + source_array = Mem.alloc(Float64, StreamQueries.input_size) + Mem.upload!(source_array, rand(Float64, StreamQueries.input_size)) + destination_array = Mem.alloc(Float64, StreamQueries.thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Float64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + @cuda_sync threads=StreamQueries.thread_count StreamQueries.kernel(source_pointer, destination_pointer) +end + +@cuda_benchmark "stream queries" stream_benchmark() From 8dba84df444d0101230618bc94c46dee4e6ef481 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 16:44:54 +0200 Subject: [PATCH 091/146] Support creating one-dimensional arrays --- examples/gpu-array.jl | 18 +++ src/compiler/optim.jl | 281 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 290 insertions(+), 9 deletions(-) create mode 100644 examples/gpu-array.jl diff --git a/examples/gpu-array.jl b/examples/gpu-array.jl new file mode 100644 index 00000000..ce97b4cc --- /dev/null +++ b/examples/gpu-array.jl @@ -0,0 +1,18 @@ +using CUDAdrv, CUDAnative, StaticArrays, InteractiveUtils + +# This example allocates an array in a GPU kernel. 
+ +const thread_count = 64 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +function kernel() + array = [1, 2, 3, 4, 5, 6, 7] + escape(array) + return +end + +@cuda_gc threads=thread_count kernel() diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index cf5e8da3..3dfae4d3 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -73,6 +73,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int if job.gc add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) + add!(pm, FunctionPass("LowerArrays", lower_array_calls!)) else add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) end @@ -518,6 +519,45 @@ function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) return changed end +# Emits instructions that allocate a particular number of bytes +# of GC-managed memory. No headroom is included. No tags are set. +function new_bytes!(builder::LLVM.Builder, size) + call!(builder, Runtime.get(:gc_malloc_object), [size]) +end + +# Emits instructions that allocate bytes for an object, including +# headroom for the object's tag. Also fills in the object's tag if +# one is provided. +function new_object!(builder::LLVM.Builder, size, tag::Union{Type, Nothing} = nothing) + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + obj_ptr = new_bytes!(builder, total_size) + + jl_value_t = llvmtype(obj_ptr) + T_bitcast = LLVM.PointerType(jl_value_t, LLVM.addrspace(jl_value_t)) + + ptr = bitcast!(builder, obj_ptr, T_bitcast) + if tag != nothing + # Fill in the tag if we have one. 
+ store!( + builder, + inttoptr!( + builder, + ConstantInt( + convert(LLVMType, Int64), + Int64(pointer_from_objref(tag))), + jl_value_t), + ptr) + end + bumped_ptr = gep!(builder, ptr, [ConstantInt(Int32(1), JuliaContext())]) + return bitcast!(builder, bumped_ptr, jl_value_t) +end + """ lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) @@ -533,10 +573,6 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) # store for an object, including headroom, but does not set the object's # tag. visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes - gc_alloc_bytes_ft = eltype(llvmtype(gc_alloc_bytes))::LLVM.FunctionType - T_ret = return_type(gc_alloc_bytes_ft)::LLVM.PointerType - T_bitcast = LLVM.PointerType(T_ret, LLVM.addrspace(T_ret)) - # Decode the call. ops = collect(operands(call)) size = ops[2] @@ -549,11 +585,7 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) - total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) - ptr = call!(builder, Runtime.get(:gc_malloc_object), [total_size]) - cast_ptr = bitcast!(builder, ptr, T_bitcast) - bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) - result_ptr = bitcast!(builder, bumped_ptr, T_ret) + result_ptr = new_object!(builder, size) replace_uses!(call, result_ptr) unsafe_delete!(LLVM.parent(call), call) dispose(builder) @@ -709,6 +741,237 @@ function insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) return true end +# Tries to evaluate an LLVM IR constant as a literal pointer. +function to_literal_pointer(value)::Tuple{Bool, Ptr{Cvoid}} + if !isa(value, LLVM.ConstantExpr) + return (false, C_NULL) + end + + if !occursin("inttoptr", string(value)) + return (false, C_NULL) + end + + # Peel off addrspacecast and inttoptr. 
+ ptr_arg = value + while occursin("addrspacecast", string(ptr_arg)) || occursin("inttoptr", string(ptr_arg)) + ptr_arg = first(operands(ptr_arg)) + end + ptr_val = convert(Int, ptr_arg) + (true, Ptr{Cvoid}(ptr_val)) +end + +# Visits all calls to literal pointers in a function. +function visit_literal_pointer_calls(visit_call::Function, fun::LLVM.Function) + for block in blocks(fun) + for call in instructions(block) + if !isa(call, LLVM.CallInst) + continue + end + + callee = called_value(call) + if !isa(callee, LLVM.ConstantExpr) + continue + end + + # detect calls to literal pointers + # FIXME: can we detect these properly? + # FIXME: jl_apply_generic and jl_invoke also have such arguments + is_ptr, ptr = to_literal_pointer(callee) + if is_ptr + # look it up in the Julia JIT cache + frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) + if length(frames) >= 1 + # @compiler_assert length(frames) == 1 job frames=frames + fn, file, line, linfo, fromC, inlined, ip = last(frames) + visit_call(call, fn) + end + end + end + end +end + +# Emits instructions that create a new array. The array's element type +# must be statically known. Its dimensions are represented as a tuple +# of LLVM IR values. A pointer to the new array is returned. 
+function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) + # Since time immemorial, the structure of an array is (quoting from the + # Julia source code here): + # + # typedef struct { + # /* + # how - allocation style + # 0 = data is inlined, or a foreign pointer we don't manage + # 1 = julia-allocated buffer that needs to be marked + # 2 = malloc-allocated pointer this array object manages + # 3 = has a pointer to the object that owns the data + # */ + # uint16_t how:2; + # uint16_t ndims:10; + # uint16_t pooled:1; + # uint16_t ptrarray:1; // representation is pointer array + # uint16_t isshared:1; // data is shared by multiple Arrays + # uint16_t isaligned:1; // data allocated with memalign + # } jl_array_flags_t; + # + # JL_EXTENSION typedef struct { + # JL_DATA_TYPE + # void *data; + # #ifdef STORE_ARRAY_LEN + # size_t length; + # #endif + # jl_array_flags_t flags; + # uint16_t elsize; + # uint32_t offset; // for 1-d only. does not need to get big. + # size_t nrows; + # union { + # // 1d + # size_t maxsize; + # // Nd + # size_t ncols; + # }; + # // other dim sizes go here for ndims > 2 + # + # // followed by alignment padding and inline data, or owner pointer + # } jl_array_t; + # + # where `STORE_ARRAY_LEN` is a preprocessor directive that is technically a + # "configuration option." AFAICT, `STORE_ARRAY_LEN` is just always defined in + # practice. + # + # The Julia compiler is more than happy to eagerly generate code that accesses + # fields of this data structure directly, so we can't invent our own array data + # structure. Consequently, we will emit code here that carefully constructs + # an instance of `jl_array_t`. + # + # To keep things tidy, we'll construct an array (ironic, I know) that contains the + # values we'll assign to each field of the array. After that, we will generate + # code that fills in every field in one fell swoop. + + fields = [] + + # Compute the size of the element type. 
+ element_type = eltype(array_type) + llvm_element_type = convert(LLVMType, element_type, true) + mod = LLVM.parent(LLVM.parent(position(builder))) + layout = datalayout(mod) + element_size = Csize_t(sizeof(layout, llvm_element_type)) + + # Compute the number of elements in the array. + element_count = LLVM.ConstantInt(convert(LLVMType, Csize_t), 1) + for i in dims + element_count = mul!(builder, element_count, intcast!(builder, i, convert(LLVMType, Csize_t))) + end + + # Compute the size of the array's elements in bytes. + data_bytesize = mul!( + builder, + LLVM.ConstantInt(convert(LLVMType, Csize_t), element_size), + element_count) + + # Actually allocate the array's contents. We will just always + # use a separate buffer. Inline data storage is wasteful and + # harder to implement. + data_ptr = new_bytes!(builder, data_bytesize) + + # The pointer to the array's data is the first field of the struct. + push!(fields, data_ptr) + + # The array's length (i.e., the product of its dimensions) is the + # second field of the `jl_array_t` struct. + push!(fields, element_count) + + # Synthesize a constant that represents the array's flags. + flags = Int16(0) + # Set the 'how' field to one. + flags |= Int16(1) + # Set the 'nDims' field. + flags <<= 10 + flags |= Int16(length(dims)) + # Set the 'pooled' field to `false`. + flags <<= 1 + flags |= Int16(false) + # Set the 'ptrarray' field. + flags <<= 1 + flags |= Int16(isa(llvm_element_type, LLVM.PointerType)) + # Set the 'isshared' field to `false`. + flags <<= 1 + flags |= Int16(false) + # Set the 'isaligned' field to `true`. + flags <<= 1 + flags |= Int16(true) + # Add the flags to the `jl_array_t` struct. + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), flags)) + + # Set the 'offset' field to zero (the array is not a slice). + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(0))) + + if length(dims) == 1 + # Set the 'nrows' field to the number of elements. 
+ push!(fields, element_count) + # Ditto for the 'maxsize' field. + push!(fields, element_count) + else + # If we're creating a multi-dimensional array, then the + # process is slightly different. + for i in dims + push!(fields, intcast!(builder, i, convert(LLVMType, Csize_t))) + end + end + + # Synthesize a struct type that neatly represents the data we want + # to store. + struct_type = LLVM.StructType([llvmtype(f) for f in fields]) + + # We now know exactly what data we want to store in each field of the + # array's control structure. + # All that's left is to actually allocate the array and write that data + # to the control structure. + obj_ptr = new_object!( + builder, + ConstantInt(convert(LLVMType, Csize_t), sizeof(layout, struct_type)), + array_type) + struct_ptr = bitcast!( + builder, + addrspacecast!( + builder, + obj_ptr, + LLVM.PointerType(eltype(llvmtype(obj_ptr)))), + LLVM.PointerType(struct_type)) + + for i in 1:length(fields) + val = fields[i] + gep = struct_gep!(builder, struct_ptr, i - 1) + store!(builder, val, gep) + end + + return obj_ptr +end + +# Lowers function calls that pertain to array operations. +function lower_array_calls!(fun::LLVM.Function) + changed_any = false + visit_literal_pointer_calls(fun) do call, name + if name == :jl_alloc_array_1d + args = collect(operands(call))[1:end - 1] + is_ptr, array_type_ptr = to_literal_pointer(args[1]) + if is_ptr + # We can lower array creation calls if we know the type + # of the array to create in advance. + array_type = unsafe_pointer_to_objref(array_type_ptr) + let builder = Builder(JuliaContext()) + position!(builder, call) + new_array = new_array!(builder, array_type, (args[2],)) + replace_uses!(call, new_array) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + end + changed_any = true + end + end + return changed_any +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. 
# # this assumes and checks that the TLS is unused, which should be the case for most GPU code From 6c4450f0808a0cb1fe07d122ff12533393fa718f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 16:57:56 +0200 Subject: [PATCH 092/146] Rename "arrays" benchmark to "static-arrays" --- gc-benchmarks/run-all.jl | 2 +- gc-benchmarks/{arrays.jl => static-arrays.jl} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename gc-benchmarks/{arrays.jl => static-arrays.jl} (89%) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 71d8f4fc..e6382718 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -2,11 +2,11 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") -include("arrays.jl") include("binary-tree.jl") include("linked-list.jl") include("matrix.jl") include("ssa-opt.jl") +include("static-arrays.jl") include("stream-queries.jl") include("genetic-algorithm.jl") diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/static-arrays.jl similarity index 89% rename from gc-benchmarks/arrays.jl rename to gc-benchmarks/static-arrays.jl index 49326cb6..50c6a3ac 100644 --- a/gc-benchmarks/arrays.jl +++ b/gc-benchmarks/static-arrays.jl @@ -2,7 +2,7 @@ module Arrays using CUDAdrv, CUDAnative, StaticArrays -# This benchmark allocates a variety of differently-sized arrays. +# This benchmark allocates a variety of differently-sized static arrays. # The point of this benchmark is to ascertain how well the GC handles # many differently-sized objects. 
@@ -50,4 +50,4 @@ function arrays_benchmark() @cuda_sync threads=Arrays.thread_count Arrays.kernel() end -@cuda_benchmark "arrays" arrays_benchmark() +@cuda_benchmark "static arrays" arrays_benchmark() From d88313e7041a939f6cd75a485096f5462bb5ac05 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 17:45:10 +0200 Subject: [PATCH 093/146] Support arrays in regular @cuda code --- src/compiler/optim.jl | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 3dfae4d3..a788a53b 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -73,10 +73,12 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int if job.gc add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) - add!(pm, FunctionPass("LowerArrays", lower_array_calls!)) + add!(pm, FunctionPass("LowerArraysGPUGC", lower_array_calls_gc!)) else add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) + add!(pm, FunctionPass("LowerArraysNoGC", lower_array_calls_nogc!)) end + aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) @@ -521,14 +523,14 @@ end # Emits instructions that allocate a particular number of bytes # of GC-managed memory. No headroom is included. No tags are set. -function new_bytes!(builder::LLVM.Builder, size) - call!(builder, Runtime.get(:gc_malloc_object), [size]) +function new_bytes!(builder::LLVM.Builder, malloc, size) + call!(builder, malloc, [size]) end # Emits instructions that allocate bytes for an object, including # headroom for the object's tag. Also fills in the object's tag if # one is provided. 
-function new_object!(builder::LLVM.Builder, size, tag::Union{Type, Nothing} = nothing) +function new_object!(builder::LLVM.Builder, malloc, size, tag::Union{Type, Nothing} = nothing) # We need to reserve a single pointer of headroom for the tag. # (LateLowerGCFrame depends on us doing that.) headroom = Runtime.tag_size @@ -536,7 +538,7 @@ function new_object!(builder::LLVM.Builder, size, tag::Union{Type, Nothing} = no # Call the allocation function and bump the resulting pointer # so the headroom sits just in front of the returned pointer. total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) - obj_ptr = new_bytes!(builder, total_size) + obj_ptr = new_bytes!(builder, malloc, total_size) jl_value_t = llvmtype(obj_ptr) T_bitcast = LLVM.PointerType(jl_value_t, LLVM.addrspace(jl_value_t)) @@ -585,7 +587,7 @@ function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) # so the headroom sits just in front of the returned pointer. let builder = Builder(JuliaContext()) position!(builder, call) - result_ptr = new_object!(builder, size) + result_ptr = new_object!(builder, Runtime.get(:gc_malloc_object), size) replace_uses!(call, result_ptr) unsafe_delete!(LLVM.parent(call), call) dispose(builder) @@ -793,7 +795,7 @@ end # Emits instructions that create a new array. The array's element type # must be statically known. Its dimensions are represented as a tuple # of LLVM IR values. A pointer to the new array is returned. -function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) +function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple) # Since time immemorial, the structure of an array is (quoting from the # Julia source code here): # @@ -871,7 +873,7 @@ function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) # Actually allocate the array's contents. We will just always # use a separate buffer. Inline data storage is wasteful and # harder to implement. 
- data_ptr = new_bytes!(builder, data_bytesize) + data_ptr = new_bytes!(builder, malloc, data_bytesize) # The pointer to the array's data is the first field of the struct. push!(fields, data_ptr) @@ -928,6 +930,7 @@ function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) # to the control structure. obj_ptr = new_object!( builder, + malloc, ConstantInt(convert(LLVMType, Csize_t), sizeof(layout, struct_type)), array_type) struct_ptr = bitcast!( @@ -948,7 +951,7 @@ function new_array!(builder::LLVM.Builder, array_type::Type, dims::Tuple) end # Lowers function calls that pertain to array operations. -function lower_array_calls!(fun::LLVM.Function) +function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false visit_literal_pointer_calls(fun) do call, name if name == :jl_alloc_array_1d @@ -960,7 +963,7 @@ function lower_array_calls!(fun::LLVM.Function) array_type = unsafe_pointer_to_objref(array_type_ptr) let builder = Builder(JuliaContext()) position!(builder, call) - new_array = new_array!(builder, array_type, (args[2],)) + new_array = new_array!(builder, malloc, array_type, (args[2],)) replace_uses!(call, new_array) unsafe_delete!(LLVM.parent(call), call) dispose(builder) @@ -972,6 +975,14 @@ function lower_array_calls!(fun::LLVM.Function) return changed_any end +function lower_array_calls_gc!(fun::LLVM.Function) + lower_array_calls!(fun, Runtime.get(:gc_malloc_object)) +end + +function lower_array_calls_nogc!(fun::LLVM.Function) + lower_array_calls!(fun, Runtime.get(:gc_pool_alloc)) +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. 
# # this assumes and checks that the TLS is unused, which should be the case for most GPU code From ec5290ecb933938b9cbe4d3861ccae59dbe22819 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 17:45:42 +0200 Subject: [PATCH 094/146] Define a new 'arrays' benchmark --- gc-benchmarks/arrays.jl | 51 ++++++++++++++++++++++++++++++ gc-benchmarks/genetic-algorithm.jl | 23 +------------- gc-benchmarks/run-all.jl | 1 + gc-benchmarks/static-arrays.jl | 8 ++--- gc-benchmarks/utils.jl | 26 +++++++++++++++ 5 files changed, 83 insertions(+), 26 deletions(-) create mode 100644 gc-benchmarks/arrays.jl diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl new file mode 100644 index 00000000..ac1fd75f --- /dev/null +++ b/gc-benchmarks/arrays.jl @@ -0,0 +1,51 @@ +module Arrays + +using CUDAdrv, CUDAnative +import ..CUDArandom: LinearCongruentialGenerator, next + +# This benchmark allocates a hierarchy of fairly modest Julia arrays. +# Some arrays remain alive, others become unreachable. This benchmark +# seeks to ascertain the performance of the allocator and garbage collector. + +const thread_count = 64 +const insertion_count = 80 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +function insert(target::Array{Any, 1}, generator::LinearCongruentialGenerator) + while true + index = next(generator, 1, length(target)) + elem = target[index] + if isa(elem, Array{Any, 1}) + if length(elem) > 0 + target = elem + continue + end + end + + target[index] = Any[Any[] for _ in 1:5] + return + end +end + +function kernel() + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + generator = LinearCongruentialGenerator(i) + toplevel = Any[Any[] for _ in 1:10] + for i in 1:insertion_count + insert(toplevel, generator) + end + return +end + +end + +function arrays_benchmark() + # Run the kernel. 
+ @cuda_sync threads=Arrays.thread_count Arrays.kernel() +end + +@cuda_benchmark "arrays" arrays_benchmark() diff --git a/gc-benchmarks/genetic-algorithm.jl b/gc-benchmarks/genetic-algorithm.jl index 69a7fcac..b4e3aa8a 100644 --- a/gc-benchmarks/genetic-algorithm.jl +++ b/gc-benchmarks/genetic-algorithm.jl @@ -6,6 +6,7 @@ module GeneticAlgorithm using CUDAnative, CUDAdrv import ..LinkedList: List, Nil, Cons, foldl, map, max +import ..CUDArandom: LinearCongruentialGenerator, next # A character in our genetic algorithm, based loosely on Fallout's SPECIAL system. mutable struct Character @@ -18,28 +19,6 @@ mutable struct Character luck::Int end -# A linear congruential pseudo-random number generator. -mutable struct LinearCongruentialGenerator - modulus::Int - a::Int - c::Int - state::Int -end - -LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) - -# Requests a pseudo-random number. -function next(generator::LinearCongruentialGenerator)::Int - generator.state = (generator.a * generator.state + generator.c) % generator.modulus - generator.state -end - -# Requests a pseudo-random number that is at least as great as `lower` -# and less than `upper`. -function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int - lower + next(generator) % (upper - lower) -end - # Computes the mean of two integers. 
function mean(a::Int, b::Int)::Int div(a + b, 2) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index e6382718..1611a509 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -2,6 +2,7 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") +include("arrays.jl") include("binary-tree.jl") include("linked-list.jl") include("matrix.jl") diff --git a/gc-benchmarks/static-arrays.jl b/gc-benchmarks/static-arrays.jl index 50c6a3ac..88fcfa43 100644 --- a/gc-benchmarks/static-arrays.jl +++ b/gc-benchmarks/static-arrays.jl @@ -1,4 +1,4 @@ -module Arrays +module StaticArraysBench using CUDAdrv, CUDAnative, StaticArrays @@ -45,9 +45,9 @@ end end -function arrays_benchmark() +function static_arrays_benchmark() # Run the kernel. - @cuda_sync threads=Arrays.thread_count Arrays.kernel() + @cuda_sync threads=StaticArraysBench.thread_count StaticArraysBench.kernel() end -@cuda_benchmark "static arrays" arrays_benchmark() +@cuda_benchmark "static arrays" static_arrays_benchmark() diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 5623bb02..aa0df174 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -72,3 +72,29 @@ end function run_benchmarks() BenchmarkTools.run(suite) end + +module CUDArandom + +# A linear congruential pseudo-random number generator. +mutable struct LinearCongruentialGenerator + modulus::Int + a::Int + c::Int + state::Int +end + +LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) + +# Requests a pseudo-random number. +function next(generator::LinearCongruentialGenerator)::Int + generator.state = (generator.a * generator.state + generator.c) % generator.modulus + generator.state +end + +# Requests a pseudo-random number that is at least as great as `lower` +# and less than `upper`. 
+function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int + lower + next(generator) % (upper - lower) +end + +end From 47a52b44b5167e87bee31f773e419d5c5e9c7968 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 17:46:22 +0200 Subject: [PATCH 095/146] Rename "gpu-array" example to "stdlib-array" --- examples/{gpu-array.jl => stdlib-array.jl} | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) rename examples/{gpu-array.jl => stdlib-array.jl} (75%) diff --git a/examples/gpu-array.jl b/examples/stdlib-array.jl similarity index 75% rename from examples/gpu-array.jl rename to examples/stdlib-array.jl index ce97b4cc..b5b17cc2 100644 --- a/examples/gpu-array.jl +++ b/examples/stdlib-array.jl @@ -1,4 +1,4 @@ -using CUDAdrv, CUDAnative, StaticArrays, InteractiveUtils +using CUDAdrv, CUDAnative, StaticArrays # This example allocates an array in a GPU kernel. @@ -12,6 +12,8 @@ end function kernel() array = [1, 2, 3, 4, 5, 6, 7] escape(array) + comp = [i * i for i in array] + escape(comp) return end From 92847f0c26ea936155abfe493b4a6b1fc5a18abb Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 17:58:30 +0200 Subject: [PATCH 096/146] Introduce unreachable objects in array benchmark --- gc-benchmarks/arrays.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl index ac1fd75f..81def2d4 100644 --- a/gc-benchmarks/arrays.jl +++ b/gc-benchmarks/arrays.jl @@ -21,8 +21,10 @@ function insert(target::Array{Any, 1}, generator::LinearCongruentialGenerator) elem = target[index] if isa(elem, Array{Any, 1}) if length(elem) > 0 - target = elem - continue + if next(generator, 0, 2) == 0 + target = elem + continue + end end end From 844c5578277ba59351a122d8e0ff4198d626d895 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 18:24:37 +0200 Subject: [PATCH 097/146] Define an array reduction benchmark --- gc-benchmarks/array-reduction.jl | 43 
++++++++++++++++++++++++++++++++ gc-benchmarks/arrays.jl | 5 ---- 2 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 gc-benchmarks/array-reduction.jl diff --git a/gc-benchmarks/array-reduction.jl b/gc-benchmarks/array-reduction.jl new file mode 100644 index 00000000..24d4492e --- /dev/null +++ b/gc-benchmarks/array-reduction.jl @@ -0,0 +1,43 @@ +module ArrayReduction + +using CUDAdrv, CUDAnative + +# This benchmark approximates pi by naively constructing an array comprehension +# for the Madhava–Leibniz series and computing its sum. It does this a few times +# to achieve a respectable run time. + +const thread_count = 256 +const series_length = 200 +const runs = 20 + +function iterative_sum(elements::Array{T})::T where T + result = zero(T) + for i in elements + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + unsafe_store!(destination, 0.0, i) + for _ in 1:runs + series = [CUDAnative.pow(-1 / 3.0, Float64(k)) / (2.0 * k + 1.0) for k in 0:series_length] + unsafe_store!(destination, unsafe_load(destination, i) + CUDAnative.sqrt(12.0) * iterative_sum(series), i) + end + return +end + +end + +function array_reduction_benchmark() + destination_array = Mem.alloc(Float64, ArrayReduction.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + + # Run the kernel. 
+ @cuda_sync threads=ArrayReduction.thread_count ArrayReduction.kernel(destination_pointer) + + @test Mem.download(Float64, destination_array, ArrayReduction.thread_count) ≈ ArrayReduction.runs .* fill(Float64(pi), ArrayReduction.thread_count) +end + +@cuda_benchmark "array reduction" array_reduction_benchmark() diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl index 81def2d4..1f247f6c 100644 --- a/gc-benchmarks/arrays.jl +++ b/gc-benchmarks/arrays.jl @@ -10,11 +10,6 @@ import ..CUDArandom: LinearCongruentialGenerator, next const thread_count = 64 const insertion_count = 80 -@noinline function escape(value) - Base.pointer_from_objref(value) - value -end - function insert(target::Array{Any, 1}, generator::LinearCongruentialGenerator) while true index = next(generator, 1, length(target)) From 8a11408c68796641fd74b3c25c92741bf5b93f5e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 15 Apr 2019 18:27:46 +0200 Subject: [PATCH 098/146] Include array reduction benchmark in "run-all.jl" --- gc-benchmarks/run-all.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 1611a509..272cdf61 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -2,6 +2,7 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") +include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") include("linked-list.jl") From 187417485186087cf76d9325881a31b1306c3b24 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 15:18:56 +0200 Subject: [PATCH 099/146] Add a bitvector benchmark --- gc-benchmarks/bitvector.jl | 101 +++++++++++++++++++++++++++++++++++++ gc-benchmarks/run-all.jl | 1 + 2 files changed, 102 insertions(+) create mode 100644 gc-benchmarks/bitvector.jl diff --git a/gc-benchmarks/bitvector.jl b/gc-benchmarks/bitvector.jl new file mode 100644 index 00000000..59892e92 --- /dev/null +++ b/gc-benchmarks/bitvector.jl @@ -0,0 +1,101 @@ +module Bitvector + +import Base: +, *, << 
+using CUDAnative + +# This benchmark performs naive arithmetic on bitvectors. +# The goal of the benchmark is to gauge how GPU-unaware +# standard library code that depends on arrays behaves when +# used in a GPU kernel. + +const thread_count = 256 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +mutable struct BitInteger{N} + bits::BitVector +end + +function zero(::Type{BitInteger{N}})::BitInteger{N} where N + BitInteger{N}(falses(N)) +end + +function one(::Type{BitInteger{N}})::BitInteger{N} where N + result = falses(N) + result[1] = true + return BitInteger{N}(result) +end + +function +(a::BitInteger{N}, b::BitInteger{N})::BitInteger{N} where N + carry = false + c = falses(N) + for i in 1:N + s = Int(a.bits[i]) + Int(b.bits[i]) + Int(carry) + if s == 1 + carry = false + c[i] = true + elseif s == 2 + carry = true + elseif s == 3 + carry = true + c[i] = true + end + end + return BitInteger{N}(c) +end + +function <<(a::BitInteger{N}, amount::Integer)::BitInteger{N} where N + c = falses(N) + for i in 1:(N - amount) + c[i + amount] = a.bits[i] + end + return BitInteger{N}(c) +end + +function *(a::BitInteger{N}, b::BitInteger{N})::BitInteger{N} where N + c = zero(BitInteger{N}) + for i in 1:N + if a.bits[i] + c += (b << (i - 1)) + end + end + return c +end + +function factorial(::Type{BitInteger{N}}, value::Integer)::BitInteger{N} where N + accumulator = one(BitInteger{N}) + iv = one(BitInteger{N}) + for i in 1:value + accumulator *= iv + iv += one(BitInteger{N}) + end + return accumulator +end + +function to_int(value::BitInteger{N})::Int where N + result = 0 + for i in 1:N + if value.bits[i] + result += (1 << (i - 1)) + end + end + return result +end + +function kernel() + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + factorial(BitInteger{128}, 10) + return +end + +end + +function bitvector_benchmark() + # Run the kernel. 
+ @cuda_sync threads=Bitvector.thread_count Bitvector.kernel() +end + +@cuda_benchmark "bitvector" bitvector_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 272cdf61..7a0ff8f4 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -5,6 +5,7 @@ include("utils.jl") include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") +include("bitvector.jl") include("linked-list.jl") include("matrix.jl") include("ssa-opt.jl") From de14a7f423b15b7bcdd9e2e49add48fbd38d8864 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 15:35:34 +0200 Subject: [PATCH 100/146] Add a 'malloc' keyword argument to the @cuda macro --- src/compiler/common.jl | 9 +++++++-- src/execution.jl | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 33232b82..e3281e90 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -12,6 +12,11 @@ struct CompilerJob maxthreads::Union{Nothing,CuDim} blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} + # The name of the 'malloc' function to use when allocating memory. + # A transform will rewrite all calls to 'malloc' to use this function + # instead. The 'malloc' signature must be 'void* malloc(size_t)' or + # compatible. + malloc::String # Indicates whether the GPU GC or the "malloc never free" # GC intrinsic lowering strategy is to be used. 
The former # is used when this field is `true`; the latter when it is @@ -21,8 +26,8 @@ struct CompilerJob CompilerJob(f, tt, cap, kernel; minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing, - gc=false) = - new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, gc) + malloc="malloc",gc=false) = + new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, malloc, gc) end # global job reference diff --git a/src/execution.jl b/src/execution.jl index f8b902e6..cf61cb74 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -9,7 +9,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # the code it generates, or the execution function split_kwargs(kwargs) macro_kws = [:dynamic, :init] - compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] + compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :malloc] call_kws = [:cooperative, :blocks, :threads, :shmem, :stream] macro_kwargs = [] compiler_kwargs = [] From 2908cdd6a36e8bcecc93f275e62af64f1ef0be1f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 15:43:57 +0200 Subject: [PATCH 101/146] Add a pass that rewrites calls to 'malloc' --- src/compiler/optim.jl | 70 ++++++++++++++++++++++++++++++++++++++++++- src/device/runtime.jl | 9 ++++-- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index a788a53b..0ca0cbfb 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -78,7 +78,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) add!(pm, FunctionPass("LowerArraysNoGC", lower_array_calls_nogc!)) end - + aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) @@ -89,6 +89,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function; int run!(pm, mod) end + 
replace_malloc!(mod, job.malloc) end # PTX-specific optimizations @@ -983,6 +984,73 @@ function lower_array_calls_nogc!(fun::LLVM.Function) lower_array_calls!(fun, Runtime.get(:gc_pool_alloc)) end +# Replaces all uses of a function in a particular module with +# a compatible function. +function replace_function!(mod::LLVM.Module, old_name::String, new_name::String; include_oom_check=false) + if new_name == old_name + # There's nothing to replace if the new function is the same as + # the old function. + return false + end + + # Otherwise, we'll try and find the malloc function. + if !haskey(functions(mod), old_name) + # If the old function doesn't even appear in the module, then it's not in + # use and we can stop right here. + return false + end + + old_function = functions(mod)[old_name] + + if haskey(functions(mod), new_name) + new_function = functions(mod)[new_name] + else + # Create a new function. + new_function = LLVM.Function( + mod, + new_name, + eltype(llvmtype(old_function)::LLVM.PointerType)::LLVM.FunctionType) + end + + if include_oom_check + wrapper = LLVM.Function( + mod, + new_name * "_checked", + eltype(llvmtype(new_function)::LLVM.PointerType)::LLVM.FunctionType) + + Builder(JuliaContext()) do builder + entry = BasicBlock(wrapper, "entry", JuliaContext()) + position!(builder, entry) + + result = call!(builder, new_function, collect(parameters(wrapper))) + check_args = LLVM.Value[result] + append!(check_args, parameters(wrapper)) + call!(builder, Runtime.get(:check_out_of_memory), check_args) + ret!(builder, result) + end + + new_function = wrapper + end + + # Replace all uses of the old function with the new function. + replace_uses!(old_function, new_function) + + return true +end + +# Replaces all uses of the malloc function in a particular module with +# a compatible function with the specified name. 
+function replace_malloc!(mod::LLVM.Module, malloc_name::String) + if malloc_name == "malloc" + # There's nothing to replace if the new malloc is the same as + # the old malloc. + return false + end + + return replace_function!(mod, "malloc", malloc_name) || + replace_function!(mod, "ptx_gc_pool_alloc", malloc_name; include_oom_check=true) +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. # # this assumes and checks that the TLS is unused, which should be the case for most GPU code diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 6b7c792e..db8c7a6d 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -129,15 +129,20 @@ end function gc_pool_alloc(sz::Csize_t) ptr = malloc(sz) + check_out_of_memory(ptr, sz) + return unsafe_pointer_to_objref(ptr) +end + +function check_out_of_memory(ptr::Ptr{Cvoid}, sz::Csize_t) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) end - return unsafe_pointer_to_objref(ptr) + return end compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) - +compile(check_out_of_memory, Cvoid, (Ptr{Cvoid}, Csize_t)) ## boxing and unboxing From f6107eb947fc53e3dfb72b9c544272180809ed54 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 17:31:43 +0200 Subject: [PATCH 102/146] Recompile runtime library for different allocators --- src/compiler/driver.jl | 2 +- src/compiler/rtlib.jl | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 70ea33ba..97b87e7a 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -68,7 +68,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, # preload libraries if libraries libdevice = load_libdevice(job.cap) - runtime = load_runtime(job.cap) + runtime = load_runtime(job.cap, job.malloc) end need_library(lib) = any(f -> isdeclaration(f) && diff --git 
a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index ad82f984..8293b098 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -122,30 +122,30 @@ end ## functionality to build the runtime library -function emit_function!(mod, cap, f, types, name) +function emit_function!(mod, cap, f, types, name, malloc) tt = Base.to_tuple_type(types) # Optimize the module that defines the function, but don't # internalize symbols in that function yet: internalizing # globals may de-alias references to globals in the runtime # library from equivalent references in the kernel. - new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false); + new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false; malloc=malloc); libraries=false, internalize=false) LLVM.name!(entry, name) link!(mod, new_mod) end -function build_runtime(cap) +function build_runtime(cap, malloc) mod = LLVM.Module("CUDAnative run-time library", JuliaContext()) for method in values(Runtime.methods) - emit_function!(mod, cap, method.def, method.types, method.llvm_name) + emit_function!(mod, cap, method.def, method.types, method.llvm_name, malloc) end mod end -function load_runtime(cap) - name = "cudanative.$(cap.major)$(cap.minor).bc" +function load_runtime(cap, malloc) + name = "cudanative.$(malloc).$(cap.major)$(cap.minor).bc" path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) mkpath(dirname(path)) @@ -155,8 +155,8 @@ function load_runtime(cap) parse(LLVM.Module, read(io), JuliaContext()) end else - @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device, this might take a while..." - lib = build_runtime(cap) + @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device (allocating with '$malloc'), this might take a while..." 
+ lib = build_runtime(cap, malloc) open(path, "w") do io write(io, lib) end From 84ffff5628c6fa5ea2f383901ad1bfdc492bb9cd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 17:32:20 +0200 Subject: [PATCH 103/146] Use 'gc_malloc' as allocator when @cuda_gc is specified --- src/gc.jl | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 82f5c0e2..00805de7 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -79,6 +79,12 @@ function data_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8} Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(FreeListRecord) end +# Takes a pointer to the first byte of data managed by an allocation record +# and produces a pointer to the record itself. +function record_pointer(data::Ptr{UInt8})::Ptr{FreeListRecord} + Base.unsafe_convert(Ptr{FreeListRecord}, record) - sizeof(FreeListRecord) +end + # Gets a pointer to the first byte of data no longer managed by an allocation record. function data_end_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8} data_pointer(record) + unsafe_load(@get_field_pointer(record, :size)) @@ -753,7 +759,7 @@ function gc_take_any_list_entry( free_list_item = unsafe_load(free_list_ptr) if free_list_item == C_NULL - break + return C_NULL end result = gc_take_list_entry(free_list_ptr, free_list_item, bytesize) @@ -763,7 +769,6 @@ function gc_take_any_list_entry( free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{FreeListRecord}} end - return C_NULL end # Tries to allocate a chunk of memory from a free list. 
@@ -948,11 +953,17 @@ function gc_transfer_and_malloc( transfer_bytesize::Csize_t, alloc_bytesize::Csize_t)::Ptr{UInt8} - gc_transfer_and_malloc( + result = gc_transfer_and_malloc( from_arena, get_free_list(to_arena), transfer_bytesize, alloc_bytesize) + + writer_locked(get_lock(to_arena)) do + unsafe_store!(@get_field_pointer(to_arena, :can_restock), true) + end + + return result end """ @@ -1439,10 +1450,9 @@ function iterate_allocated(fun::Function, arena::Ptr{FreeListArena}) iterate_allocation_records(fun, allocation_list_head) end -# Iterates through all active allocation records in a GC arena. -function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) - # Compose a set that contains all data addresses of chunks that - # are on the shelves. +# Composes a set that contains all data addresses of chunks that +# are on the shelves. +function chunks_on_shelves(arena::Ptr{BodegaArena}) arena_data = unsafe_load(arena) chunks_on_shelves = Set{Ptr{UInt8}}() for i in 1:arena_data.shelf_count @@ -1451,11 +1461,17 @@ function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) push!(chunks_on_shelves, unsafe_load(shelf.chunks, j)) end end + return chunks_on_shelves +end + +# Iterates through all active allocation records in a GC arena. +function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) + shelf_chunks = chunks_on_shelves(arena) # Now iterate through the allocation list, ignoring records that have # been placed on the shelves. iterate_allocated(get_free_list(arena)) do record - if !(data_pointer(record) in chunks_on_shelves) + if !(data_pointer(record) in shelf_chunks) fun(record) end end @@ -1504,8 +1520,15 @@ end # Frees all dead blocks in an arena. function gc_free_garbage(arena::Ptr{BodegaArena}, live_blocks::Set{Ptr{FreeListRecord}}) + # Mark chunks on shelves as live. 
+ all_live_blocks = Set{Ptr{FreeListRecord}}(live_blocks) + shelf_chunks = chunks_on_shelves(arena) + for chunk_ptr in shelf_chunks + push!(all_live_blocks, record_pointer(chunk_ptr)) + end + # Free garbage in the free list sub-arena. - gc_free_garbage(get_free_list(arena), live_blocks) + gc_free_garbage(get_free_list(arena), all_live_blocks) # Mark the arena as ready for restocking. unsafe_store!(@get_field_pointer(arena, :can_restock), true) @@ -1850,7 +1873,7 @@ macro cuda_gc(ex...) # Standard kernel setup logic. local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, $(map(esc, compiler_kwargs)...)) + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, malloc="ptx_gc_malloc", $(map(esc, compiler_kwargs)...)) CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) gc_report.elapsed_time = Base.@elapsed begin kernel(kernel_args...; $(map(esc, call_kwargs)...)) From e14b9b336f5016bed02f49474338c7c5cc482d5e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 17:32:52 +0200 Subject: [PATCH 104/146] Implement array expansion method --- src/compiler/optim.jl | 50 ++++++--------- src/device/runtime.jl | 146 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 158 insertions(+), 38 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 0ca0cbfb..2ce36d1f 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -871,6 +871,12 @@ function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple LLVM.ConstantInt(convert(LLVMType, Csize_t), element_size), element_count) + if element_size == Csize_t(1) && length(dims) == 1 + # If we're allocating an array of bytes, we will throw in an extra + # byte at the end for compatibility with Julia's ABI. 
+ data_bytesize = add!(builder, data_bytesize, LLVM.ConstantInt(convert(LLVMType, Csize_t), 1)) + end + # Actually allocate the array's contents. We will just always # use a separate buffer. Inline data storage is wasteful and # harder to implement. @@ -905,6 +911,9 @@ function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple # Add the flags to the `jl_array_t` struct. push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), flags)) + # Set the 'elsize' field. + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(element_size))) + # Set the 'offset' field to zero (the array is not a slice). push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(0))) @@ -955,8 +964,8 @@ end function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false visit_literal_pointer_calls(fun) do call, name + args = collect(operands(call))[1:end - 1] if name == :jl_alloc_array_1d - args = collect(operands(call))[1:end - 1] is_ptr, array_type_ptr = to_literal_pointer(args[1]) if is_ptr # We can lower array creation calls if we know the type @@ -971,6 +980,14 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true + elseif name == :jl_array_grow_end + let builder = Builder(JuliaContext()) + position!(builder, call) + new_call = call!(builder, Runtime.get(name), args) + replace_uses!(call, new_call) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end end end return changed_any @@ -986,7 +1003,7 @@ end # Replaces all uses of a function in a particular module with # a compatible function. -function replace_function!(mod::LLVM.Module, old_name::String, new_name::String; include_oom_check=false) +function replace_function!(mod::LLVM.Module, old_name::String, new_name::String) if new_name == old_name # There's nothing to replace if the new function is the same as # the old function. 
@@ -1012,26 +1029,6 @@ function replace_function!(mod::LLVM.Module, old_name::String, new_name::String; eltype(llvmtype(old_function)::LLVM.PointerType)::LLVM.FunctionType) end - if include_oom_check - wrapper = LLVM.Function( - mod, - new_name * "_checked", - eltype(llvmtype(new_function)::LLVM.PointerType)::LLVM.FunctionType) - - Builder(JuliaContext()) do builder - entry = BasicBlock(wrapper, "entry", JuliaContext()) - position!(builder, entry) - - result = call!(builder, new_function, collect(parameters(wrapper))) - check_args = LLVM.Value[result] - append!(check_args, parameters(wrapper)) - call!(builder, Runtime.get(:check_out_of_memory), check_args) - ret!(builder, result) - end - - new_function = wrapper - end - # Replace all uses of the old function with the new function. replace_uses!(old_function, new_function) @@ -1041,14 +1038,7 @@ end # Replaces all uses of the malloc function in a particular module with # a compatible function with the specified name. function replace_malloc!(mod::LLVM.Module, malloc_name::String) - if malloc_name == "malloc" - # There's nothing to replace if the new malloc is the same as - # the old malloc. - return false - end - - return replace_function!(mod, "malloc", malloc_name) || - replace_function!(mod, "ptx_gc_pool_alloc", malloc_name; include_oom_check=true) + return replace_function!(mod, "malloc", malloc_name) end # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. 
diff --git a/src/device/runtime.jl b/src/device/runtime.jl index db8c7a6d..d5f6f693 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -129,20 +129,14 @@ end function gc_pool_alloc(sz::Csize_t) ptr = malloc(sz) - check_out_of_memory(ptr, sz) - return unsafe_pointer_to_objref(ptr) -end - -function check_out_of_memory(ptr::Ptr{Cvoid}, sz::Csize_t) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) end - return + return unsafe_pointer_to_objref(ptr) end compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) -compile(check_out_of_memory, Cvoid, (Ptr{Cvoid}, Csize_t)) ## boxing and unboxing @@ -230,6 +224,8 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end +## Garbage collection + # LLVM type of a pointer to a tracked pointer function T_pprjlvalue() T_pjlvalue = convert(LLVMType, Any, true) @@ -237,7 +233,8 @@ function T_pprjlvalue() LLVM.PointerType(eltype(T_pjlvalue), Tracked)) end -# Include the GC memory allocation function into the runtime. +# Include GC memory allocation functions into the runtime. +compile(CUDAnative.gc_malloc, Ptr{UInt8}, (Csize_t,)) compile(CUDAnative.gc_malloc_object, Any, (Csize_t,), T_prjlvalue) # Include GC frame management functions into the runtime. @@ -261,4 +258,137 @@ compile( compile(CUDAnative.gc_safepoint, Cvoid, ()) compile(CUDAnative.gc_perma_safepoint, Cvoid, ()) +## Arrays + +# A data structure that carefully mirrors an in-memory array control +# structure for Julia arrays, as laid out by the compiler. +mutable struct Array1D + # This is the data layout for Julia arrays, which we adhere to here. + # + # JL_EXTENSION typedef struct { + # JL_DATA_TYPE + # void *data; + # #ifdef STORE_ARRAY_LEN + # size_t length; + # #endif + # jl_array_flags_t flags; + # uint16_t elsize; + # uint32_t offset; // for 1-d only. does not need to get big. 
+ # size_t nrows; + # union { + # // 1d + # size_t maxsize; + # // Nd + # size_t ncols; + # }; + # // other dim sizes go here for ndims > 2 + # + # // followed by alignment padding and inline data, or owner pointer + # } jl_array_t; + + data::Ptr{UInt8} + length::Csize_t + flags::UInt16 + elsize::UInt16 + offset::UInt32 + nrows::Csize_t + maxsize::Csize_t +end + +function zero_fill!(ptr::Ptr{UInt8}, count::Integer) + for i in 1:count + unsafe_store!(ptr, UInt8(0), count) + end + return +end + +function memmove!(dst::Ptr{UInt8}, src::Ptr{UInt8}, sz::Integer) + if src < dst + for i in 1:sz + unsafe_store!(dst, unsafe_load(src, i), i) + end + else + for i in sz:-1:1 + unsafe_store!(dst, unsafe_load(src, i), i) + end + end +end + +# Resize the buffer to a max size of `newlen` +# The buffer can either be newly allocated or realloc'd, the return +# value is true if a new buffer is allocated and false if it is realloc'd. +# the caller needs to take care of moving the data from the old buffer +# to the new one if necessary. +# When this function returns, the `.data` pointer always points to +# the **beginning** of the new buffer. +function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool + elsz = Csize_t(a.elsize) + nbytes = newlen * elsz + oldnbytes = a.maxsize * elsz + + if elsz == 1 + nbytes += 1 + oldnbytes += 1 + end + + # Allocate a new buffer. Note that 'malloc' will get replaced with + # the "right" allocation function for the environment in which this + # function is compiled. So if the GC is enabled, then 'malloc' will + # actually call 'gc_malloc'. 
+ a.data = malloc(nbytes) + zero_fill!(a.data + oldnbytes, nbytes - oldnbytes) + a.maxsize = newlen + return true +end + +function jl_array_grow_at_end(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) + data = a.data + elsz = Csize_t(a.elsize) + reqmaxsize = a.offset + n + inc + has_gap = n > idx + if reqmaxsize > a.maxsize + nb1 = idx * elsz + nbinc = inc * elsz + + if reqmaxsize < 4 + newmaxsize = Csize_t(4) + elseif reqmaxsize >= a.maxsize * 2 + newmaxsize = reqmaxsize + else + newmaxsize = a.maxsize * 2 + end + + newbuf = array_resize_buffer(a, newmaxsize) + newdata = a.data + a.offset * elsz + if newbuf + memmove!(newdata, data, nb1) + if has_gap + memmove!(newdata + nb1 + nbinc, data + nb1, n * elsz - nb1) + end + elseif has_gap + memmove!(newdata + nb1 + nbinc, newdata + nb1, n * elsz - nb1) + end + a.data = data = newdata + end + + newnrows = n + inc + a.length = newnrows + a.nrows = newnrows + zero_fill!(data + idx * elsz, inc * elsz) + return +end + +function jl_array_grow_end(a::Array1D, inc::Csize_t) + n = a.nrows + jl_array_grow_at_end(a, n, inc, n) + return +end + +compile( + jl_array_grow_end, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + end From 68f474727c2f8a3ffbbe5deaa8037db8df4ca322 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 17:33:32 +0200 Subject: [PATCH 105/146] Create an array expansion benchmark --- gc-benchmarks/array-expansion.jl | 46 ++++++++++++++++++++++++++++++++ gc-benchmarks/run-all.jl | 1 + 2 files changed, 47 insertions(+) create mode 100644 gc-benchmarks/array-expansion.jl diff --git a/gc-benchmarks/array-expansion.jl b/gc-benchmarks/array-expansion.jl new file mode 100644 index 00000000..95dc9bb0 --- /dev/null +++ b/gc-benchmarks/array-expansion.jl @@ -0,0 +1,46 @@ +module ArrayExpansion + +using CUDAdrv, CUDAnative + +# This benchmark has every thread create arrays and repeatedly +# append elements to those arrays. 
+ +const thread_count = 256 +const array_length = 200 +const runs = 10 + +function iterative_sum(elements::Array{T})::T where T + result = zero(T) + for i in elements + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + result = 0 + for j in 1:runs + array = Int[] + for k in 1:array_length + push!(array, k) + end + result += iterative_sum(array) + end + unsafe_store!(destination, result, i) + return +end + +end + +function array_expansion_benchmark() + destination_array = Mem.alloc(Int, ArrayExpansion.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int}, destination_array) + + # Run the kernel. + @cuda_sync threads=ArrayExpansion.thread_count ArrayExpansion.kernel(destination_pointer) + + @test Mem.download(Int, destination_array, ArrayExpansion.thread_count) == fill(ArrayExpansion.runs * sum(1:ArrayExpansion.array_length), ArrayExpansion.thread_count) +end + +@cuda_benchmark "array expansion" array_expansion_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 7a0ff8f4..c7bb7083 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -2,6 +2,7 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") +include("array-expansion.jl") include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") From a6b49dc3729e729e42394b53835c016a6f6877a9 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 19:05:50 +0200 Subject: [PATCH 106/146] Introduce a special 'managed_malloc' runtime function --- src/compiler/common.jl | 8 ++++---- src/compiler/optim.jl | 6 +++--- src/device/runtime.jl | 21 ++++++++++++++++----- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index e3281e90..cdecb6b1 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -12,10 +12,10 @@ struct CompilerJob maxthreads::Union{Nothing,CuDim} 
blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} - # The name of the 'malloc' function to use when allocating memory. - # A transform will rewrite all calls to 'malloc' to use this function - # instead. The 'malloc' signature must be 'void* malloc(size_t)' or - # compatible. + # The name of the memory allocation function to use when allocating + # managed memory. A transform will rewrite all managed memory allocations + # to use this function instead. The 'malloc' signature must be + # 'void* malloc(size_t)' or compatible. malloc::String # Indicates whether the GPU GC or the "malloc never free" # GC intrinsic lowering strategy is to be used. The former diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 2ce36d1f..d28176a2 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1035,10 +1035,10 @@ function replace_function!(mod::LLVM.Module, old_name::String, new_name::String) return true end -# Replaces all uses of the malloc function in a particular module with -# a compatible function with the specified name. +# Replaces all uses of the managed memory allocation function in a +# particular module with a compatible function with the specified name. function replace_malloc!(mod::LLVM.Module, malloc_name::String) - return replace_function!(mod, "malloc", malloc_name) + return replace_function!(mod, "ptx_managed_malloc", malloc_name) end # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. diff --git a/src/device/runtime.jl b/src/device/runtime.jl index d5f6f693..278aee28 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -127,8 +127,18 @@ function T_prjlvalue() LLVM.PointerType(eltype(T_pjlvalue), Tracked) end +# A function that gets replaced by the proper 'malloc' implementation +# for the context it executes in. 
When the GC is used, calls to this +# function are replaced with 'gc_malloc'; otherwise, this function gets +# rewritten as a call to the allocator, probably 'malloc'. +@noinline function managed_malloc(sz::Csize_t) + malloc(sz) +end + +compile(managed_malloc, Ptr{UInt8}, (Csize_t,)) + function gc_pool_alloc(sz::Csize_t) - ptr = malloc(sz) + ptr = managed_malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) @@ -331,11 +341,12 @@ function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool oldnbytes += 1 end - # Allocate a new buffer. Note that 'malloc' will get replaced with + # Allocate a new buffer. 'managed_malloc' will get replaced with # the "right" allocation function for the environment in which this - # function is compiled. So if the GC is enabled, then 'malloc' will - # actually call 'gc_malloc'. - a.data = malloc(nbytes) + # function is compiled. So if the GC is enabled, then 'managed_malloc' + # will actually call 'gc_malloc'; otherwise, it's probably going to + # be 'malloc'. + a.data = managed_malloc(nbytes) zero_fill!(a.data + oldnbytes, nbytes - oldnbytes) a.maxsize = newlen return true From 133101f9dbb7cc101ae958b81929ef2cb18a3768 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 21 Apr 2019 15:08:39 +0200 Subject: [PATCH 107/146] Implement 'managed_malloc' differently --- src/compiler/optim.jl | 4 ++-- src/device/runtime.jl | 27 ++++++++++++++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index d28176a2..7b426152 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1010,7 +1010,7 @@ function replace_function!(mod::LLVM.Module, old_name::String, new_name::String) return false end - # Otherwise, we'll try and find the malloc function. + # Otherwise, we'll try and find the old function. 
if !haskey(functions(mod), old_name) # If the old function doesn't even appear in the module, then it's not in # use and we can stop right here. @@ -1038,7 +1038,7 @@ end # Replaces all uses of the managed memory allocation function in a # particular module with a compatible function with the specified name. function replace_malloc!(mod::LLVM.Module, malloc_name::String) - return replace_function!(mod, "ptx_managed_malloc", malloc_name) + return replace_function!(mod, "julia.managed_malloc", malloc_name) end # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 278aee28..d081686d 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -131,14 +131,31 @@ end # for the context it executes in. When the GC is used, calls to this # function are replaced with 'gc_malloc'; otherwise, this function gets # rewritten as a call to the allocator, probably 'malloc'. -@noinline function managed_malloc(sz::Csize_t) - malloc(sz) -end +@generated function managed_malloc(sz::Csize_t) + T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) + T_size = convert(LLVMType, Csize_t) + T_ptr = convert(LLVMType, Ptr{UInt8}) + + # create function + llvm_f, _ = create_function(T_ptr, [T_size]) + mod = LLVM.parent(llvm_f) + + intr = LLVM.Function(mod, "julia.managed_malloc", LLVM.FunctionType(T_pint8, [T_size])) + + # generate IR + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + ptr = call!(builder, intr, [parameters(llvm_f)[1]]) + jlptr = ptrtoint!(builder, ptr, T_ptr) + ret!(builder, jlptr) + end -compile(managed_malloc, Ptr{UInt8}, (Csize_t,)) + call_function(llvm_f, Ptr{UInt8}, Tuple{Csize_t}, :((sz,))) +end function gc_pool_alloc(sz::Csize_t) - ptr = managed_malloc(sz) + ptr = malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) 
throw(OutOfMemoryError()) From 73018f5a64685a5b9168c719475e15f4d1c32af8 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 25 Apr 2019 11:20:01 +0200 Subject: [PATCH 108/146] Consider custom malloc during IR checking --- src/compiler/validation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index e405b8d9..0fbae515 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -118,7 +118,7 @@ function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst) fn = LLVM.name(dest) # detect calls to undefined functions - if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) + if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) && fn != job.malloc # figure out if the function lives in the Julia runtime library if libjulia[] == C_NULL paths = filter(Libdl.dllist()) do path From d505dad208a6eb0c21683783d86b15eee90fa8d4 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sun, 5 May 2019 16:18:39 +0200 Subject: [PATCH 109/146] Switch to acquire-release semantics for atomics --- src/device/threading.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/device/threading.jl b/src/device/threading.jl index 846db990..96e58f72 100644 --- a/src/device/threading.jl +++ b/src/device/threading.jl @@ -10,7 +10,7 @@ export ReaderWriterLock, reader_locked, writer_locked, Mutex, try_lock, unlock lt = string(convert(LLVMType, T)) ir = """ %ptr = inttoptr $ptr_type %0 to $lt* - %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 seq_cst seq_cst + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 acq_rel acquire %rv = extractvalue { $lt, i1 } %result, 0 ret $lt %rv """ @@ -22,7 +22,7 @@ end lt = string(convert(LLVMType, T)) ir = """ %ptr = inttoptr $ptr_type %0 to $lt* - %rv = atomicrmw volatile $(String(op)) $lt* %ptr, $lt %1 seq_cst + %rv = atomicrmw volatile $(String(op)) $lt* %ptr, $lt %1 acq_rel ret $lt %rv """ 
:(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T}, lhs, rhs)) From 1aad738a46f8211465e142330c122de37526a997 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 6 May 2019 16:07:19 +0200 Subject: [PATCH 110/146] Expose GC configuration options --- src/execution.jl | 4 +- src/gc.jl | 134 +++++++++++++++++++++++++++++++---------------- 2 files changed, 90 insertions(+), 48 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index cf61cb74..fc930c7c 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -8,7 +8,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and # the code it generates, or the execution function split_kwargs(kwargs) - macro_kws = [:dynamic, :init] + macro_kws = [:dynamic, :init, :gc_config] compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :malloc] call_kws = [:cooperative, :blocks, :threads, :shmem, :stream] macro_kwargs = [] @@ -450,7 +450,7 @@ functionality is included in [`@cuda`](@ref). The 'init' keyword argument is a function that takes a kernel as argument and sets up an environment for the kernel. """ -function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel) where {F,TT} +function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel, kw...) where {F,TT} # Just call the 'init' function for now. init(kernel) end diff --git a/src/gc.jl b/src/gc.jl index 00805de7..a116a687 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -43,7 +43,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. -export @cuda_gc, gc_malloc, gc_malloc_object, gc_collect, gc_safepoint +export @cuda_gc, gc_malloc, gc_malloc_object, gc_collect, gc_safepoint, GCConfiguration import Base: length, show import Printf: @sprintf @@ -1126,30 +1126,6 @@ end # One megabyte. const MiB = 1 << 20 -# The initial size of the GC heap, currently 10 MiB. 
-const initial_gc_heap_size = 10 * MiB - -# The default capacity of a root buffer, i.e., the max number of -# roots that can be stored per thread. Currently set to -# 256 roots. That's 2 KiB of roots per thread. -const default_root_buffer_capacity = 256 - -# The point at which the global arena is deemed to be starving, i.e., -# it no longer contains enough memory to perform basic allocations. -# If the global arena's free byte count stays below the arena starvation -# threshold after a collection phase, the collector will allocate -# additional memory to the arena such that it is no longer starving. -# The arena starvation threshold is currently set to 4 MiB. -const global_arena_starvation_threshold = 4 * MiB - -# The point at which a local arena is deemed to be starving, i.e., -# it no longer contains enough memory to perform basic allocations. -# If a local arena's free byte count stays below the arena starvation -# threshold after a collection phase, the collector will allocate -# additional memory to the arena such that it is no longer starving. -# The arena starvation threshold is currently set to 1 MiB. -const local_arena_starvation_threshold = 1 * MiB - # The point at which a tiny arena is deemed to be starving, i.e., # it no longer contains enough memory to perform basic allocations. # If a tiny arena's free byte count stays below the arena starvation @@ -1178,17 +1154,77 @@ end GCHeapDescription() = GCHeapDescription([]) +# A data structure that contains GC configuration parameters. +struct GCConfiguration + # The number of local arenas to create. + local_arena_count::Int + + # The max number of roots that can be stored per thread. + root_buffer_capacity::Int + + # The point at which the global arena is deemed to be starving, i.e., + # it no longer contains enough memory to perform basic allocations. 
+ # If the global arena's free byte count stays below the arena starvation + # threshold after a collection phase, the collector will allocate + # additional memory to the arena such that it is no longer starving. + global_arena_starvation_threshold::Int + + # The initial size of the global arena, in bytes. + global_arena_initial_size::Int + + # The point at which a local arena is deemed to be starving, i.e., + # it no longer contains enough memory to perform basic allocations. + # If a local arena's free byte count stays below the arena starvation + # threshold after a collection phase, the collector will allocate + # additional memory to the arena such that it is no longer starving. + local_arena_starvation_threshold::Int + + # The initial size of a local arena, in bytes. + local_arena_initial_size::Int +end + +# Creates a GC configuration. +function GCConfiguration(; + local_arena_count::Integer = 8, + root_buffer_capacity::Integer = 256, + global_arena_starvation_threshold::Integer = 4 * MiB, + global_arena_initial_size::Integer = 2 * MiB, + local_arena_starvation_threshold::Integer = 1 * MiB, + local_arena_initial_size::Integer = 1 * MiB) + + GCConfiguration( + local_arena_count, + root_buffer_capacity, + global_arena_starvation_threshold, + global_arena_initial_size, + local_arena_starvation_threshold, + local_arena_initial_size) +end + +function initial_heap_size(config::GCConfiguration, thread_count::Integer) + warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device())) + local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count + safepoint_bytesize = sizeof(SafepointState) * warp_count + fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count + rootbuf_bytesize = sizeof(ObjectRef) * config.root_buffer_capacity * thread_count + + result = 0 + result += local_arenas_bytesize + result += safepoint_bytesize + result += fingerbuf_bytesize + result += rootbuf_bytesize + result += config.local_arena_count * 
config.local_arena_initial_size + result += config.global_arena_initial_size + return result +end + # Initializes a GC heap and produces a master record. function gc_init!( heap::GCHeapDescription, - thread_count::Integer; - warp_count::Union{Integer, Nothing} = nothing, - root_buffer_capacity::Integer = default_root_buffer_capacity, - local_arena_count::Integer = 8)::GCMasterRecord + config::GCConfiguration, + thread_count::Integer)::GCMasterRecord - if warp_count == nothing - warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device())) - end + warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device())) master_region = heap.regions[1] @@ -1196,7 +1232,7 @@ function gc_init!( gc_memory_end_ptr = master_region.start + master_region.size # Allocate a local arena pointer buffer. - local_arenas_bytesize = sizeof(Ptr{LocalArena}) * local_arena_count + local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{LocalArena}}, gc_memory_start_ptr) # Allocate the safepoint flag buffer. @@ -1206,12 +1242,12 @@ function gc_init!( # Allocate root buffers. fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, safepoint_ptr + fingerbuf_bytesize) - rootbuf_bytesize = sizeof(ObjectRef) * root_buffer_capacity * thread_count + rootbuf_bytesize = sizeof(ObjectRef) * config.root_buffer_capacity * thread_count rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, fingerbuf_ptr + fingerbuf_bytesize) # Populate the root buffer fingers. for i in 1:thread_count - unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * root_buffer_capacity, i) + unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * config.root_buffer_capacity, i) end # Compute a pointer to the start of the tiny arena. @@ -1226,10 +1262,10 @@ function gc_init!( end # Set up local arenas. 
- for i in 1:local_arena_count - local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(local_arena_starvation_threshold)) + for i in 1:config.local_arena_count + local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(config.local_arena_initial_size)) unsafe_store!(local_arenas_ptr, local_arena, i) - arena_start_ptr += local_arena_starvation_threshold + arena_start_ptr += config.local_arena_initial_size end # Set up the global arena. @@ -1238,8 +1274,8 @@ function gc_init!( return GCMasterRecord( warp_count, UInt32(thread_count), - root_buffer_capacity, - UInt32(local_arena_count), + UInt32(config.root_buffer_capacity), + UInt32(config.local_arena_count), arena_for_ants, local_arenas_ptr, global_arena, @@ -1659,7 +1695,7 @@ end # Collects garbage. This function is designed to be called by the host, # not by the device. -function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, report::GCReport) +function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, config::GCConfiguration, report::GCReport) poll_time = Base.@elapsed begin # First off, we have to wait for all warps to reach a safepoint. Clear # safepoint flags and wait for warps to set them again. @@ -1748,9 +1784,9 @@ function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, # limit then we'll expand the GC heap and add the additional memory # to the arena's free list. threshold = if arena == master_record.global_arena - global_arena_starvation_threshold + config.global_arena_starvation_threshold else - local_arena_starvation_threshold + config.local_arena_starvation_threshold end if free_memory < threshold @@ -1822,6 +1858,9 @@ macro cuda_gc(ex...) # Get the total number of threads. thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) + # Get the GC configuration. 
+ config = get_kwarg_or_default(env_kwargs, :gc_config, GCConfiguration()) + # convert the arguments, call the compiler and launch the kernel # while keeping the original arguments alive push!(code.args, @@ -1831,11 +1870,14 @@ macro cuda_gc(ex...) local host_interrupt_array = alloc_shared_array((1,), ready) local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) + # Evaluate the GC configuration. + local gc_config = $(esc(config)) + # Allocate a shared buffer for GC memory. - local gc_memory_size = initial_gc_heap_size + sizeof(ObjectRef) * default_root_buffer_capacity * $(esc(thread_count)) + local gc_memory_size = initial_heap_size(gc_config, $(esc(thread_count))) local gc_heap = GCHeapDescription() expand!(gc_heap, gc_memory_size) - local master_record = gc_init!(gc_heap, $(esc(thread_count))) + local master_record = gc_init!(gc_heap, gc_config, $(esc(thread_count))) # Define a kernel initialization function. local function kernel_init(kernel) @@ -1866,7 +1908,7 @@ macro cuda_gc(ex...) local gc_report = GCReport() local function handle_interrupt() - gc_collect_impl(master_record, gc_heap, gc_report) + gc_collect_impl(master_record, gc_heap, gc_config, gc_report) end try From 757520451924a20de3f8c98bfc820167462c3e0c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 11:04:17 +0200 Subject: [PATCH 111/146] Make genetic algo, ssa opt benchmarks quicker --- gc-benchmarks/genetic-algorithm.jl | 2 +- gc-benchmarks/ssa-opt.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gc-benchmarks/genetic-algorithm.jl b/gc-benchmarks/genetic-algorithm.jl index b4e3aa8a..6484226d 100644 --- a/gc-benchmarks/genetic-algorithm.jl +++ b/gc-benchmarks/genetic-algorithm.jl @@ -131,7 +131,7 @@ function genetic_algo(seed::Int)::Character end # Run the genetic algorithm for a few iterations. 
- for j in 1:10 + for j in 1:2 individuals = step(individuals, generator) end diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl index b7c10238..a9a83acd 100644 --- a/gc-benchmarks/ssa-opt.jl +++ b/gc-benchmarks/ssa-opt.jl @@ -86,7 +86,7 @@ end const thread_count = 256 function kernel() - block = create_range_sum_block(50) + block = create_range_sum_block(25) fold_constants(block) return end From fc7273729bbf874cff80f8bd5aa003bbb83e7d7d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 11:04:58 +0200 Subject: [PATCH 112/146] Try two GC configs when running benchmarks --- gc-benchmarks/run-all.jl | 8 +++++--- gc-benchmarks/utils.jl | 12 ++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index c7bb7083..10d99f09 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -20,13 +20,15 @@ println(results) # Also write them to a CSV for further analysis. open("results.csv", "w") do file - write(file, "benchmark,nogc,gc,ratio\n") + write(file, "benchmark,nogc,gc,gc-shared,nogc-ratio,gc-ratio,gc-shared-ratio\n") for key in sort([k for k in keys(results)]) runs = results[key] median_times = BenchmarkTools.median(runs) gc_time = median_times["gc"].time / 1e6 + gc_shared_time = median_times["gc-shared"].time / 1e6 nogc_time = median_times["nogc"].time / 1e6 - ratio = gc_time / nogc_time - write(file, "$key,$nogc_time,$gc_time,$ratio\n") + gc_ratio = gc_time / nogc_time + gc_shared_ratio = gc_shared_time / nogc_time + write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,1,$gc_ratio,$gc_shared_ratio\n") end end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index aa0df174..701ae891 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -42,7 +42,7 @@ end macro cuda_sync(args...) esc(quote if should_use_gc() - CUDAnative.@cuda_gc $(args...) + CUDAnative.@cuda_gc gc_config=gc_config $(args...) else @sync CUDAnative.@cuda $(args...) 
end @@ -55,11 +55,19 @@ function register_cuda_benchmark(f, name, config) suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 end +const MiB = 1 << 20 + macro cuda_benchmark(name, ex) esc(quote - suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "nogc"]) + suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "gc-shared", "nogc"]) register_cuda_benchmark($name, "gc") do global use_gc = true + global gc_config = GCConfiguration(local_arena_count=8, local_arena_initial_size=MiB, global_arena_initial_size=2 * MiB) + $(ex) + end + register_cuda_benchmark($name, "gc-shared") do + global use_gc = true + global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=10 * MiB) $(ex) end register_cuda_benchmark($name, "nogc") do From 573d580adc5cf511e5041b078de23f0aaedc1773 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 11:23:20 +0200 Subject: [PATCH 113/146] Fold '@cuda_gc' into '@cuda' --- examples/binary-tree.jl | 12 ++-- examples/gc.jl | 2 +- examples/linked-list.jl | 6 +- examples/matrix.jl | 4 +- examples/stdlib-array.jl | 2 +- gc-benchmarks/utils.jl | 2 +- src/execution.jl | 104 +++++++++++++++++++++++++++++----- src/gc.jl | 117 +-------------------------------------- test/device/gc.jl | 4 +- 9 files changed, 109 insertions(+), 144 deletions(-) diff --git a/examples/binary-tree.jl b/examples/binary-tree.jl index 46db7d38..812af535 100644 --- a/examples/binary-tree.jl +++ b/examples/binary-tree.jl @@ -9,7 +9,7 @@ import Base: haskey, insert! # The main point of this example is to demonstrate that even # naive, pointer-chasing programs can be compiled to GPU kernels. 
-const use_gc = true +const use_gc = false """A binary search tree node.""" abstract type BinarySearchTreeNode{T} end @@ -136,6 +136,8 @@ function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) return end +ccall((:ha_init_bytes, "/media/jonathan/Quark/School/CUDAnative.jl/libhalloc"), Cvoid, (Csize_t,), Csize_t(256 * 1024 * 1024)) + # Generate a sequence of 64-bit truncated Fibonacci numbers. number_set = fibonacci(Int64, number_count) # Randomize the sequence's order. @@ -156,18 +158,18 @@ Mem.upload!(destination_array, test_sequence) if use_gc # Run the kernel. - @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + @cuda gc=true malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) # Run it again. Mem.upload!(destination_array, test_sequence) - stats = @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + stats = @cuda gc=true malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) else # Run the kernel. - @cuda threads=thread_count kernel(source_pointer, destination_pointer) + @cuda malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) # Run it again and time it this time. Mem.upload!(destination_array, test_sequence) - stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) + stats = CUDAdrv.@elapsed @cuda malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) end println(stats) diff --git a/examples/gc.jl b/examples/gc.jl index 51fe758e..6e81bfb2 100644 --- a/examples/gc.jl +++ b/examples/gc.jl @@ -42,6 +42,6 @@ Mem.upload!(source_array, fill(42.f0, thread_count)) Mem.upload!(destination_array, zeros(Float32, thread_count)) # Run the kernel. 
-@cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) +@cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) @test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) diff --git a/examples/linked-list.jl b/examples/linked-list.jl index 2c7e949c..8e2c7f3a 100644 --- a/examples/linked-list.jl +++ b/examples/linked-list.jl @@ -45,7 +45,7 @@ function sum(list::List{T}) where T reduce(+, list; init=zero(T)) end -const element_count = 1000 +const element_count = 2000 const thread_count = 32 function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) @@ -67,8 +67,8 @@ Mem.upload!(destination_array, zeros(Int64, thread_count)) # Run the kernel. if use_gc - @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) - stats = @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + @cuda gc=true threads=thread_count gc_config=GCConfiguration(; global_arena_initial_size=1024, global_arena_starvation_threshold=1024) kernel(source_pointer, destination_pointer) + stats = @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) else @cuda threads=thread_count kernel(source_pointer, destination_pointer) stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) diff --git a/examples/matrix.jl b/examples/matrix.jl index 277aacd1..69fa73d8 100644 --- a/examples/matrix.jl +++ b/examples/matrix.jl @@ -121,9 +121,9 @@ destination_array = Mem.alloc(Int64, thread_count) destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) if use_gc - time = @cuda_gc threads=thread_count kernel(destination_pointer) + time = @cuda gc=true threads=thread_count kernel(destination_pointer) println(time) - time = @cuda_gc threads=thread_count kernel(destination_pointer) + time = @cuda gc=true threads=thread_count kernel(destination_pointer) println(time) else time = CUDAdrv.@elapsed @cuda 
threads=thread_count kernel(destination_pointer) diff --git a/examples/stdlib-array.jl b/examples/stdlib-array.jl index b5b17cc2..157a468f 100644 --- a/examples/stdlib-array.jl +++ b/examples/stdlib-array.jl @@ -17,4 +17,4 @@ function kernel() return end -@cuda_gc threads=thread_count kernel() +@cuda gc=true threads=thread_count kernel() diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 701ae891..73d359ed 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -42,7 +42,7 @@ end macro cuda_sync(args...) esc(quote if should_use_gc() - CUDAnative.@cuda_gc gc_config=gc_config $(args...) + CUDAnative.@cuda gc=true gc_config=gc_config $(args...) else @sync CUDAnative.@cuda $(args...) end diff --git a/src/execution.jl b/src/execution.jl index fc930c7c..4c1337f6 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -9,7 +9,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # the code it generates, or the execution function split_kwargs(kwargs) macro_kws = [:dynamic, :init, :gc_config] - compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :malloc] + compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :malloc, :gc] call_kws = [:cooperative, :blocks, :threads, :shmem, :stream] macro_kwargs = [] compiler_kwargs = [] @@ -90,6 +90,9 @@ performed, scheduling a kernel launch on the current CUDA context. Several keyword arguments are supported that influence the behavior of `@cuda`. 
- `dynamic`: use dynamic parallelism to launch device-side kernels +- `gc`: set up a GC and use it to allocate memory; cannot be combined with `dynamic` +- `gc_config`: the GC configuration to use if `gc=true`; see [`GCConfiguration`](@ref) +- `malloc`: the name of the allocation function to use, if `gc` is not in use - arguments that influence kernel compilation: see [`cufunction`](@ref) and [`dynamic_cufunction`](@ref) - arguments that influence kernel launch: see [`CUDAnative.HostKernel`](@ref) and @@ -133,21 +136,15 @@ macro cuda(ex...) args = call.args[2:end] code = quote end - macro_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) + env_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) vars, var_exprs = assign_args!(code, args) # handle keyword arguments that influence the macro's behavior - dynamic = false - env_kwargs = [] - for kwarg in macro_kwargs - key,val = kwarg.args - if key == :dynamic - isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value")) - dynamic = val::Bool - else - push!(env_kwargs, kwarg) - end - end + dynamic = get_kwarg_or_default(env_kwargs, :dynamic, false) + isa(dynamic, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant Boolean")) + + gc = get_kwarg_or_default(compiler_kwargs, :gc, false) + isa(gc, Bool) || throw(ArgumentError("`gc` keyword argument to @cuda should be a constant Boolean")) if dynamic # FIXME: we could probably somehow support kwargs with constant values by either @@ -155,6 +152,9 @@ macro cuda(ex...) # IR when processing the dynamic parallelism marker isempty(compiler_kwargs) || error("@cuda dynamic parallelism does not support compiler keyword arguments") + # FIXME: update the GC to support dynamic parallelism somehow. + !gc || error("@cuda does not support both `gc=true` and `dynamic=true`") + # dynamic, device-side kernel launch push!(code.args, quote @@ -164,6 +164,84 @@ macro cuda(ex...) 
prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel($(var_exprs...); $(map(esc, call_kwargs)...)) end) + elseif gc + # Find the stream on which the kernel is to be scheduled. + stream = get_kwarg_or_default(call_kwargs, :stream, CuDefaultStream()) + + # Get the total number of threads. + thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) + + # Get the GC configuration. + config = get_kwarg_or_default(env_kwargs, :gc_config, GCConfiguration()) + + # GC-enabled host-side launch. + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local host_interrupt_array = alloc_shared_array((1,), ready) + local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) + + # Evaluate the GC configuration. + local gc_config = $(esc(config)) + + # Allocate a shared buffer for GC memory. + local gc_memory_size = initial_heap_size(gc_config, $(esc(thread_count))) + local gc_heap = GCHeapDescription() + expand!(gc_heap, gc_memory_size) + local master_record = gc_init!(gc_heap, gc_config, $(esc(thread_count))) + + # Define a kernel initialization function. + local function kernel_init(kernel) + # Set the interrupt state pointer. + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, CuPtr{UInt32}(device_interrupt_buffer.ptr)) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + + # Set the GC master record. + try + global_handle = CuGlobal{GCMasterRecord}(kernel.mod, "gc_master_record") + set(global_handle, master_record) + catch exception + # The GC info pointer may not have been declared (because it is unused). + # In that case, we should do nothing. 
+ if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + local gc_report = GCReport() + local function handle_interrupt() + gc_collect_impl(master_record, gc_heap, gc_config, gc_report) + end + + try + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; malloc="ptx_gc_malloc", $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) + gc_report.elapsed_time = Base.@elapsed begin + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + end + finally + free_shared_array(host_interrupt_array) + free!(gc_heap) + end + gc_report + end + end) else # regular, host-side kernel launch # diff --git a/src/gc.jl b/src/gc.jl index a116a687..2c3763f5 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -43,7 +43,7 @@ # * When the device runs out of GC memory, it requests an interrupt # to mark and sweep. -export @cuda_gc, gc_malloc, gc_malloc_object, gc_collect, gc_safepoint, GCConfiguration +export gc_malloc, gc_malloc_object, gc_collect, gc_safepoint, GCConfiguration import Base: length, show import Printf: @sprintf @@ -1817,118 +1817,3 @@ function get_kwarg_or_default(kwarg_list, key::Symbol, default) end return default end - -""" - @cuda_gc [kwargs...] func(args...) - -High-level interface for executing code on a GPU with GC support. -The `@cuda_gc` macro should prefix a call, with `func` a callable function -or object that should return nothing. It will be compiled to a CUDA function upon first -use, and to a certain extent arguments will be converted and managed automatically using -`cudaconvert`. 
Next, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel -launch on the current CUDA context. Finally, `@cuda_gc` waits for the kernel to finish, -performing garbage collection in the meantime if necessary. - -Several keyword arguments are supported that influence kernel compilation and execution. For -more information, refer to the documentation of respectively [`cufunction`](@ref) and -[`CUDAnative.Kernel`](@ref). -""" -macro cuda_gc(ex...) - # destructure the `@cuda_gc` expression - if length(ex) > 0 && ex[1].head == :tuple - error("The tuple argument to @cuda has been replaced by keywords: `@cuda_gc threads=... fun(args...)`") - end - call = ex[end] - kwargs = ex[1:end-1] - - # destructure the kernel call - if call.head != :call - throw(ArgumentError("second argument to @cuda_gc should be a function call")) - end - f = call.args[1] - args = call.args[2:end] - - code = quote end - env_kwargs, compiler_kwargs, call_kwargs = CUDAnative.split_kwargs(kwargs) - vars, var_exprs = CUDAnative.assign_args!(code, args) - - # Find the stream on which the kernel is to be scheduled. - stream = get_kwarg_or_default(call_kwargs, :stream, CuDefaultStream()) - - # Get the total number of threads. - thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) - - # Get the GC configuration. - config = get_kwarg_or_default(env_kwargs, :gc_config, GCConfiguration()) - - # convert the arguments, call the compiler and launch the kernel - # while keeping the original arguments alive - push!(code.args, - quote - GC.@preserve $(vars...) begin - # Define a trivial buffer that contains the interrupt state. - local host_interrupt_array = alloc_shared_array((1,), ready) - local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) - - # Evaluate the GC configuration. - local gc_config = $(esc(config)) - - # Allocate a shared buffer for GC memory. 
- local gc_memory_size = initial_heap_size(gc_config, $(esc(thread_count))) - local gc_heap = GCHeapDescription() - expand!(gc_heap, gc_memory_size) - local master_record = gc_init!(gc_heap, gc_config, $(esc(thread_count))) - - # Define a kernel initialization function. - local function kernel_init(kernel) - # Set the interrupt state pointer. - try - global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") - set(global_handle, CuPtr{UInt32}(device_interrupt_buffer.ptr)) - catch exception - # The interrupt pointer may not have been declared (because it is unused). - # In that case, we should do nothing. - if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code - rethrow() - end - end - - # Set the GC master record. - try - global_handle = CuGlobal{GCMasterRecord}(kernel.mod, "gc_master_record") - set(global_handle, master_record) - catch exception - # The GC info pointer may not have been declared (because it is unused). - # In that case, we should do nothing. - if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code - rethrow() - end - end - end - - local gc_report = GCReport() - local function handle_interrupt() - gc_collect_impl(master_record, gc_heap, gc_config, gc_report) - end - - try - # Standard kernel setup logic. - local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) - local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; gc = true, malloc="ptx_gc_malloc", $(map(esc, compiler_kwargs)...)) - CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) - gc_report.elapsed_time = Base.@elapsed begin - kernel(kernel_args...; $(map(esc, call_kwargs)...)) - - # Handle interrupts. 
- handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) - end - finally - free_shared_array(host_interrupt_array) - free!(gc_heap) - end - gc_report - end - end) - return code -end diff --git a/test/device/gc.jl b/test/device/gc.jl index 1ec9b0fc..640d5ebf 100644 --- a/test/device/gc.jl +++ b/test/device/gc.jl @@ -6,7 +6,7 @@ dummy() = return dummy_handler(kernel) = return -@testset "@cuda_gc" begin +@testset "@cuda gc=true" begin @testset "allocate and collect" begin # This test allocates many very small and very large objects. Both the small @@ -60,7 +60,7 @@ dummy_handler(kernel) = return Mem.upload!(destination_array, zeros(Float32, thread_count)) # Run the kernel. - @cuda_gc threads=thread_count kernel(source_pointer, destination_pointer) + @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) @test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) end From 9ac081d40349f1f577d63355e1e4d397e4b5ca41 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 12:26:08 +0200 Subject: [PATCH 114/146] Reuse pinned memory support from CUDAdrv --- src/execution.jl | 11 +++++----- src/gc.jl | 24 +++++++-------------- src/interrupts.jl | 53 ++++++----------------------------------------- 3 files changed, 19 insertions(+), 69 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 4c1337f6..374a232c 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -179,8 +179,9 @@ macro cuda(ex...) quote GC.@preserve $(vars...) begin # Define a trivial buffer that contains the interrupt state. 
- local host_interrupt_array = alloc_shared_array((1,), ready) - local device_interrupt_buffer = get_shared_device_buffer(host_interrupt_array) + local interrupt_buffer = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, sizeof(ready), CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) + unsafe_store!(Base.unsafe_convert(Ptr{UInt32}, interrupt_buffer), ready) + local device_interrupt_pointer = Base.unsafe_convert(CuPtr{UInt32}, interrupt_buffer) # Evaluate the GC configuration. local gc_config = $(esc(config)) @@ -196,7 +197,7 @@ macro cuda(ex...) # Set the interrupt state pointer. try global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") - set(global_handle, CuPtr{UInt32}(device_interrupt_buffer.ptr)) + set(global_handle, device_interrupt_pointer) catch exception # The interrupt pointer may not have been declared (because it is unused). # In that case, we should do nothing. @@ -233,10 +234,10 @@ macro cuda(ex...) kernel(kernel_args...; $(map(esc, call_kwargs)...)) # Handle interrupts. - handle_interrupts(handle_interrupt, pointer(host_interrupt_array, 1), $(esc(stream))) + handle_interrupts(handle_interrupt, pointer(interrupt_buffer), $(esc(stream))) end finally - free_shared_array(host_interrupt_array) + CUDAdrv.Mem.free(interrupt_buffer) free!(gc_heap) end gc_report diff --git a/src/gc.jl b/src/gc.jl index 2c3763f5..7bc4f0b8 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -1135,16 +1135,7 @@ const MiB = 1 << 20 const tiny_arena_starvation_threshold = 0 # 2 * MiB # A description of a region of memory that has been allocated to the GC heap. -struct GCHeapRegion - # A buffer that contains the GC region's bytes. - buffer::Array{UInt8, 1} - # A pointer to the first element in the region. - start::Ptr{UInt8} - # The region's size in bytes. - size::Csize_t -end - -GCHeapRegion(buffer::Array{UInt8, 1}) = GCHeapRegion(buffer, pointer(buffer, 1), Csize_t(length(buffer))) +const GCHeapRegion = CUDAdrv.Mem.HostBuffer # A description of all memory that has been allocated to the GC heap. 
struct GCHeapDescription @@ -1228,8 +1219,8 @@ function gc_init!( master_region = heap.regions[1] - gc_memory_start_ptr = master_region.start - gc_memory_end_ptr = master_region.start + master_region.size + gc_memory_start_ptr = pointer(master_region) + gc_memory_end_ptr = pointer(master_region) + sizeof(master_region) # Allocate a local arena pointer buffer. local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count @@ -1397,7 +1388,7 @@ end # Tells if a GC heap contains a particular pointer. function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T for region in heap.regions - if pointer >= region.start && pointer < region.start + region.size + if pointer >= pointer(region) && pointer < pointer(region) + sizeof(region) return true end end @@ -1408,8 +1399,7 @@ end # the list of allocated regions. `size` describes the amount of bytes to # allocate. Returns the allocated region. function expand!(heap::GCHeapDescription, size::Integer)::GCHeapRegion - buffer = alloc_shared_array((size,), UInt8(0)) - region = GCHeapRegion(buffer) + region = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, size, CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) push!(heap.regions, region) return region end @@ -1417,7 +1407,7 @@ end # Frees all memory allocated by a GC heap. function free!(heap::GCHeapDescription) for region in heap.regions - free_shared_array(region.buffer) + CUDAdrv.Mem.free(region) end end @@ -1646,7 +1636,7 @@ end # Expands a GC arena by assigning it an additional heap region. 
function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion) - extra_record = make_gc_block!(region.start, region.size) + extra_record = make_gc_block!(pointer(region), sizeof(region)) last_free_list_ptr = @get_field_pointer(arena, :free_list_head) iterate_free(arena) do record last_free_list_ptr = @get_field_pointer(record, :next) diff --git a/src/interrupts.jl b/src/interrupts.jl index de7cc7cb..fb3076dd 100644 --- a/src/interrupts.jl +++ b/src/interrupts.jl @@ -9,48 +9,6 @@ import CUDAdrv: @apicall export @cuda_interruptible, interrupt, interrupt_or_wait, wait_for_interrupt -# Allocates an array of host memory that is page-locked and accessible -# to the device. Maps the allocation into the CUDA address space. -# Returns a host array that can be turned into a device array by calling -# the `get_shared_device_buffer` function. -function alloc_shared_array(dims::Tuple{Vararg{Int64, N}}, init::T)::Array{T, N} where {T, N} - # Allocate memory that is accessible to both the host and the device. - bytesize = prod(dims) * sizeof(T) - ptr_ref = Ref{Ptr{Cvoid}}() - @apicall( - :cuMemAllocHost, - (Ptr{Ptr{Cvoid}}, Csize_t), - ptr_ref, bytesize) - - # Wrap the memory in an array. - host_array = Base.unsafe_wrap(Array{T, N}, Ptr{T}(ptr_ref[]), dims; own = false) - - # Initialize the array's contents. - fill!(host_array, init) - - return host_array -end - -# Gets the device array that corresponds to a shared host array. -# NOTE: this function only works for arrays that were allocated by -# `alloc_shared_array`. It has undefined behavior for all other arrays. -function get_shared_device_buffer(shared_array::Array{T, N})::Mem.Buffer where {T, N} - bytesize = length(shared_array) * sizeof(T) - CUDAdrv.Mem.Buffer( - convert(CuPtr{T}, convert(Csize_t, pointer(shared_array, 1))), - bytesize, - CuCurrentContext()) -end - -# Frees an array of host memory. 
-function free_shared_array(shared_array::Array{T, N}) where {T, N} - ptr = pointer(shared_array, 1) - @apicall( - :cuMemFreeHost, - (Ptr{Cvoid},), - ptr) -end - # Queries a stream for its status. function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint return ccall( @@ -255,8 +213,9 @@ macro cuda_interruptible(handler, ex...) quote GC.@preserve $(vars...) begin # Define a trivial buffer that contains the interrupt state. - local host_array = alloc_shared_array((1,), ready) - local device_buffer = get_shared_device_buffer(host_array) + local interrupt_buffer = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, sizeof(ready), CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) + unsafe_store!(Base.unsafe_convert(Ptr{UInt32}, interrupt_buffer), ready) + local device_interrupt_pointer = Base.unsafe_convert(CuPtr{UInt32}, interrupt_buffer) try # Define a kernel initialization function that sets the @@ -264,7 +223,7 @@ macro cuda_interruptible(handler, ex...) local function interrupt_kernel_init(kernel) try global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") - set(global_handle, CuPtr{UInt32}(device_buffer.ptr)) + set(global_handle, device_interrupt_pointer) catch exception # The interrupt pointer may not have been declared (because it is unused). # In that case, we should do nothing. @@ -282,9 +241,9 @@ macro cuda_interruptible(handler, ex...) kernel(kernel_args...; $(map(esc, call_kwargs)...)) # Handle interrupts. 
- handle_interrupts($(esc(handler)), pointer(host_array, 1), $(esc(stream))) + handle_interrupts($(esc(handler)), pointer(interrupt_buffer), $(esc(stream))) finally - free_shared_array(host_array) + CUDAdrv.Mem.free(interrupt_buffer) end end end) From 61818e8b1fd9606d21e6ab91b04faad395806520 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 10 May 2019 15:18:40 +0200 Subject: [PATCH 115/146] Handle multi-dimensional 'thread' args gracefully --- src/execution.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 8f3816c0..bb66f4c5 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -188,10 +188,10 @@ macro cuda(ex...) local gc_config = $(esc(config)) # Allocate a shared buffer for GC memory. - local gc_memory_size = initial_heap_size(gc_config, $(esc(thread_count))) + local gc_memory_size = initial_heap_size(gc_config, prod($(esc(thread_count)))) local gc_heap = GCHeapDescription() expand!(gc_heap, gc_memory_size) - local master_record = gc_init!(gc_heap, gc_config, $(esc(thread_count))) + local master_record = gc_init!(gc_heap, gc_config, prod($(esc(thread_count)))) # Define a kernel initialization function. 
local function kernel_init(kernel) From 35a3652e19475129c75382747ded9c02099ea5da Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 11:57:03 +0200 Subject: [PATCH 116/146] Define 'upload!', 'download' benchmark utils --- gc-benchmarks/utils.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 73d359ed..83cfacef 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -106,3 +106,13 @@ function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::I end end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end From 09edf89d94ac314155096ef2f5f8fa51b9ccd5a7 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 12:47:49 +0200 Subject: [PATCH 117/146] Implement a bump allocator for kernels --- src/device/runtime.jl | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index df3b3d13..94b1abcd 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -12,6 +12,7 @@ module Runtime using ..CUDAnative using LLVM using LLVM.Interop +using CUDAdrv import ..CUDAnative: GCFrame ## representation of a runtime method instance @@ -286,6 +287,49 @@ compile( compile(CUDAnative.gc_safepoint, Cvoid, ()) compile(CUDAnative.gc_perma_safepoint, Cvoid, ()) +## Bump allocator. + +# Allocates `bytesize` bytes of storage by bumping the global bump +# allocator pointer. 
+function bump_alloc(bytesize::Csize_t)::Ptr{UInt8} + ptr = CUDAnative.@cuda_global_ptr("bump_alloc_ptr", Csize_t) + chunk_address = CUDAnative.atomic_add!(ptr, bytesize) + end_ptr = unsafe_load(CUDAnative.@cuda_global_ptr("bump_alloc_end", Csize_t)) + if chunk_address < end_ptr + return Ptr{UInt8}(chunk_address) + else + return C_NULL + end +end + +compile(bump_alloc, Ptr{UInt8}, (Csize_t,)) + +function maybe_set_global(kernel, name, value::T) where T + try + global_handle = CuGlobal{T}(kernel.mod, name) + set(global_handle, value) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end +end + +function bump_alloc_init!(kernel, capacity) + buf = Mem.alloc(Mem.DeviceBuffer, capacity) + start_address = pointer(buf) + end_address = start_address + capacity + maybe_set_global(kernel, "bump_alloc_ptr", start_address) + maybe_set_global(kernel, "bump_alloc_end", end_address) + return start_address +end + +function bump_alloc_finalize!(kernel, ptr) + Mem.free(ptr) +end + ## Arrays # A data structure that carefully mirrors an in-memory array control From 60d6fc6de211cedce7ff18ace5b0305d0e4431b7 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 12:49:34 +0200 Subject: [PATCH 118/146] Add a bump allocator to the GC benchmark configs --- gc-benchmarks/utils.jl | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 83cfacef..f3db300e 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,10 +1,10 @@ import BenchmarkTools -function should_use_gc() +function get_gc_mode() try - return use_gc + return gc_mode catch ex - return true + return "gc" end end @@ -41,8 +41,11 @@ end macro cuda_sync(args...) 
esc(quote - if should_use_gc() + local mode = get_gc_mode() + if mode == "gc" CUDAnative.@cuda gc=true gc_config=gc_config $(args...) + elseif mode == "bump" + @sync CUDAnative.@cuda init=(k -> CUDAnative.Runtime.bump_alloc_init!(k, 60 * MiB)) malloc="ptx_bump_alloc" $(args...) else @sync CUDAnative.@cuda $(args...) end @@ -52,26 +55,30 @@ end suite = BenchmarkTools.BenchmarkGroup() function register_cuda_benchmark(f, name, config) - suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 + suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 end const MiB = 1 << 20 macro cuda_benchmark(name, ex) esc(quote - suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "gc-shared", "nogc"]) + suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "gc-shared", "nogc", "bump"]) register_cuda_benchmark($name, "gc") do - global use_gc = true + global gc_mode = "gc" global gc_config = GCConfiguration(local_arena_count=8, local_arena_initial_size=MiB, global_arena_initial_size=2 * MiB) $(ex) end register_cuda_benchmark($name, "gc-shared") do - global use_gc = true + global gc_mode = "gc" global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=10 * MiB) $(ex) end register_cuda_benchmark($name, "nogc") do - global use_gc = false + global gc_mode = "nogc" + $(ex) + end + register_cuda_benchmark($name, "bump") do + global gc_mode = "bump" $(ex) end end) From 1805d7fa6aeb2025ffa020e8f80675ffdb422d2e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 14:13:25 +0200 Subject: [PATCH 119/146] Use 'managed_malloc' to implement 'gc_pool_alloc' --- src/device/runtime.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 94b1abcd..8df26f9c 100644 --- a/src/device/runtime.jl +++ 
b/src/device/runtime.jl @@ -156,7 +156,7 @@ end end function gc_pool_alloc(sz::Csize_t) - ptr = malloc(sz) + ptr = managed_malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) From d99ed4a87e7b088324dedfdf1e78d0e40e520cd6 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 14:21:49 +0200 Subject: [PATCH 120/146] Update test runner to write bump allocator results --- gc-benchmarks/run-all.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 10d99f09..5a12b676 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -20,15 +20,17 @@ println(results) # Also write them to a CSV for further analysis. open("results.csv", "w") do file - write(file, "benchmark,nogc,gc,gc-shared,nogc-ratio,gc-ratio,gc-shared-ratio\n") + write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") for key in sort([k for k in keys(results)]) runs = results[key] median_times = BenchmarkTools.median(runs) gc_time = median_times["gc"].time / 1e6 gc_shared_time = median_times["gc-shared"].time / 1e6 nogc_time = median_times["nogc"].time / 1e6 + bump_time = median_times["bump"].time / 1e6 gc_ratio = gc_time / nogc_time gc_shared_ratio = gc_shared_time / nogc_time - write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,1,$gc_ratio,$gc_shared_ratio\n") + bump_ratio = bump_time / nogc_time + write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,$bump_time,1,$gc_ratio,$gc_shared_ratio,$bump_ratio\n") end end From c1356b1a16e35a668a58562622210505b87ff140 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 23 May 2019 14:54:50 +0200 Subject: [PATCH 121/146] Change how bump allocators are initialized --- gc-benchmarks/utils.jl | 9 ++++++++- src/device/runtime.jl | 14 +++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/gc-benchmarks/utils.jl 
b/gc-benchmarks/utils.jl index f3db300e..eca5b0cf 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -45,7 +45,14 @@ macro cuda_sync(args...) if mode == "gc" CUDAnative.@cuda gc=true gc_config=gc_config $(args...) elseif mode == "bump" - @sync CUDAnative.@cuda init=(k -> CUDAnative.Runtime.bump_alloc_init!(k, 60 * MiB)) malloc="ptx_bump_alloc" $(args...) + local capacity = 60 * MiB + local buf = Mem.alloc(Mem.DeviceBuffer, capacity) + local start_address = pointer(buf) + local function init(kernel) + CUDAnative.Runtime.bump_alloc_init!(kernel, start_address, capacity) + end + @sync CUDAnative.@cuda init=init malloc="ptx_bump_alloc" $(args...) + Mem.free(buf) else @sync CUDAnative.@cuda $(args...) end diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 8df26f9c..62899ab1 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -317,17 +317,9 @@ function maybe_set_global(kernel, name, value::T) where T end end -function bump_alloc_init!(kernel, capacity) - buf = Mem.alloc(Mem.DeviceBuffer, capacity) - start_address = pointer(buf) - end_address = start_address + capacity - maybe_set_global(kernel, "bump_alloc_ptr", start_address) - maybe_set_global(kernel, "bump_alloc_end", end_address) - return start_address -end - -function bump_alloc_finalize!(kernel, ptr) - Mem.free(ptr) +function bump_alloc_init!(kernel, buffer_start, buffer_size) + maybe_set_global(kernel, "bump_alloc_ptr", buffer_start) + maybe_set_global(kernel, "bump_alloc_end", buffer_start + buffer_size) end ## Arrays From 2cdaf68eee52f86fc0d6e51110643934c46abe2c Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 13:54:19 +0200 Subject: [PATCH 122/146] Implement jl_array_sizehint --- src/compiler/optim.jl | 2 +- src/device/runtime.jl | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index d7691907..0f42a2af 100644 --- a/src/compiler/optim.jl +++ 
b/src/compiler/optim.jl @@ -959,7 +959,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true - elseif name == :jl_array_grow_end + elseif name in [:jl_array_grow_end, :jl_array_sizehint] let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, Runtime.get(name), args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 62899ab1..8f289e6d 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -406,7 +406,12 @@ function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool return true end -function jl_array_grow_at_end(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) +""" + jl_array_grow_at(a, idx, inc, n) + +Grows array `a` containing `n` elements by `inc` elements at index `idx`. +""" +function jl_array_grow_at(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) data = a.data elsz = Csize_t(a.elsize) reqmaxsize = a.offset + n + inc @@ -445,7 +450,7 @@ end function jl_array_grow_end(a::Array1D, inc::Csize_t) n = a.nrows - jl_array_grow_at_end(a, n, inc, n) + jl_array_grow_at(a, n, inc, n) return end @@ -456,4 +461,32 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) +""" + jl_array_sizehint(a, sz) + +Suggest that collection `a` reserve capacity for at least `sz` elements. 
+""" +function jl_array_sizehint(a::Array1D, sz::Csize_t) + n = a.length + data = a.data + elsz = Csize_t(a.elsize) + reqmaxsize = a.offset + sz + if reqmaxsize > a.maxsize + newbuf = array_resize_buffer(a, reqmaxsize) + newdata = a.data + a.offset * elsz + if newbuf + memmove!(newdata, data, n * elsz) + end + a.data = data = newdata + end + return +end + +compile( + jl_array_sizehint, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + end From fc809757e502992aff5dbe47327c84513b0fb6c9 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 14:36:57 +0200 Subject: [PATCH 123/146] Implement jl_array_grow_at --- src/compiler/optim.jl | 2 +- src/device/runtime.jl | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 0f42a2af..788f3912 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -959,7 +959,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true - elseif name in [:jl_array_grow_end, :jl_array_sizehint] + elseif name in [:jl_array_grow_end, :jl_array_grow_at, :jl_array_sizehint] let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, Runtime.get(name), args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 8f289e6d..f0130968 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -407,19 +407,18 @@ function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool end """ - jl_array_grow_at(a, idx, inc, n) + jl_array_grow_at_impl(a, idx, inc, n) Grows array `a` containing `n` elements by `inc` elements at index `idx`. 
""" -function jl_array_grow_at(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) +function jl_array_grow_at_impl(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) data = a.data elsz = Csize_t(a.elsize) reqmaxsize = a.offset + n + inc has_gap = n > idx + nb1 = idx * elsz + nbinc = inc * elsz if reqmaxsize > a.maxsize - nb1 = idx * elsz - nbinc = inc * elsz - if reqmaxsize < 4 newmaxsize = Csize_t(4) elseif reqmaxsize >= a.maxsize * 2 @@ -439,18 +438,42 @@ function jl_array_grow_at(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) memmove!(newdata + nb1 + nbinc, newdata + nb1, n * elsz - nb1) end a.data = data = newdata + elseif has_gap + memmove!(data + nb1 + nbinc, data + nb1, n * elsz - nb1) end newnrows = n + inc a.length = newnrows a.nrows = newnrows - zero_fill!(data + idx * elsz, inc * elsz) + zero_fill!(data + nb1, nbinc) + return +end + +""" + jl_array_grow_at(a, idx, inc) + +Grows array `a` by `inc` elements at index `idx`. +""" +function jl_array_grow_at(a::Array1D, idx::Cssize_t, inc::Csize_t) + jl_array_grow_at_impl(a, Csize_t(idx), inc, a.nrows) return end +compile( + jl_array_grow_at, + Cvoid, + (Array1D, Cssize_t, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Cssize_t), convert(LLVMType, Csize_t)]) + +""" + jl_array_grow_end(a, inc) + +Grows array `a` by `inc` elements at the end. 
+""" function jl_array_grow_end(a::Array1D, inc::Csize_t) n = a.nrows - jl_array_grow_at(a, n, inc, n) + jl_array_grow_at_impl(a, n, inc, n) return end From f6d7b83422faf65e9f16dbe25c584608f883595a Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 14:49:30 +0200 Subject: [PATCH 124/146] Implement 'jl_array_grow_beg' --- src/compiler/optim.jl | 2 +- src/device/runtime.jl | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 788f3912..db230eae 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -959,7 +959,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true - elseif name in [:jl_array_grow_end, :jl_array_grow_at, :jl_array_sizehint] + elseif name in [:jl_array_grow_at, :jl_array_grow_beg, :jl_array_grow_end, :jl_array_sizehint] let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, Runtime.get(name), args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index f0130968..4a167c2d 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -367,7 +367,7 @@ function zero_fill!(ptr::Ptr{UInt8}, count::Integer) end function memmove!(dst::Ptr{UInt8}, src::Ptr{UInt8}, sz::Integer) - if src < dst + if dst < src for i in 1:sz unsafe_store!(dst, unsafe_load(src, i), i) end @@ -376,6 +376,7 @@ function memmove!(dst::Ptr{UInt8}, src::Ptr{UInt8}, sz::Integer) unsafe_store!(dst, unsafe_load(src, i), i) end end + return end # Resize the buffer to a max size of `newlen` @@ -484,6 +485,23 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) +""" + jl_array_grow_beg(a, inc) + +Grows array `a` by `inc` elements at the beginning of the array. 
+""" +function jl_array_grow_beg(a::Array1D, inc::Csize_t) + jl_array_grow_at_impl(a, Csize_t(0), inc, a.nrows) + return +end + +compile( + jl_array_grow_beg, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + """ jl_array_sizehint(a, sz) From 272e77e8b6e6f906a2982c42e68c5a6709f974c9 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 15:09:44 +0200 Subject: [PATCH 125/146] Implement array deletion methods --- src/compiler/optim.jl | 11 +++++++- src/device/runtime.jl | 58 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index db230eae..69aa9174 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -942,6 +942,15 @@ end # Lowers function calls that pertain to array operations. function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false + runtime_methods = [ + :jl_array_grow_at, + :jl_array_grow_beg, + :jl_array_grow_end, + :jl_array_del_at, + :jl_array_del_beg, + :jl_array_del_end, + :jl_array_sizehint + ] visit_literal_pointer_calls(fun) do call, name args = collect(operands(call))[1:end - 1] if name == :jl_alloc_array_1d @@ -959,7 +968,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) end end changed_any = true - elseif name in [:jl_array_grow_at, :jl_array_grow_beg, :jl_array_grow_end, :jl_array_sizehint] + elseif name in runtime_methods let builder = Builder(JuliaContext()) position!(builder, call) new_call = call!(builder, Runtime.get(name), args) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 4a167c2d..a1a3f4ff 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -530,4 +530,62 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) +""" + jl_array_del_at_impl(a, idx, dec, n) + +Removes a range of elements from array `a`. 
+""" +function jl_array_del_at_impl(a::Array1D, idx::Csize_t, dec::Csize_t, n::Csize_t) + data = a.data + elsz = a.elsize + last = idx + dec + if n > last + memmove!(data + idx * elsz, data + last * elsz, (n - last) * elsz) + end + n -= dec + if elsz == 1 + Base.unsafe_store!(data, n + 1, UInt8(0)) + end + a.nrows = n + a.length = n + return +end + +function jl_array_del_beg(a::Array1D, dec::Csize_t) + jl_array_del_at_impl(a, Csize_t(0), dec, a.nrows) + return +end + +compile( + jl_array_del_beg, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +function jl_array_del_end(a::Array1D, dec::Csize_t) + n = a.nrows + jl_array_del_at_impl(a, n, dec, n) + return +end + +compile( + jl_array_del_end, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +function jl_array_del_at(a::Array1D, idx::Cssize_t, dec::Csize_t) + jl_array_del_at_impl(a, Csize_t(idx), dec, a.nrows) + return +end + +compile( + jl_array_del_at, + Cvoid, + (Array1D, Cssize_t, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Cssize_t), convert(LLVMType, Csize_t)]) + end From 23fa152201e6f0b3efe9122f82d903dd8d236acd Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 15:10:22 +0200 Subject: [PATCH 126/146] Create an array feature-testing benchmark --- gc-benchmarks/array-features.jl | 77 +++++++++++++++++++++++++++++++++ gc-benchmarks/run-all.jl | 1 + 2 files changed, 78 insertions(+) create mode 100644 gc-benchmarks/array-features.jl diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl new file mode 100644 index 00000000..078a0f8d --- /dev/null +++ b/gc-benchmarks/array-features.jl @@ -0,0 +1,77 @@ +module ArrayFeatures + +using CUDAdrv, CUDAnative + +# This benchmark has every thread exercise the entire low-level +# array API. 
+ +const thread_count = 256 + +# Creates an array of Fibonacci numbers. +function fib_array(count::Integer) + result = [1, 1] + # Calls `jl_array_sizehint`. + sizehint!(result, count + 2) + for i in 1:count + # Calls `jl_array_grow_end`. + push!(result, result[i] + result[i + 1]) + end + return result +end + +function intersperse_with!(vec::Vector{T}, value::T) where T + for i in 1:length(vec) + # Calls `jl_array_grow_at`. + insert!(vec, i * 2, value) + end + return vec +end + +function manipulate_array() + # Initialize the array as a Fibonacci sequence. + arr = fib_array(20) + + # Intersperse the array with constants. + intersperse_with!(arr, 2) + + # Prepend a constant to the array (calls `jl_array_grow_beg`). + pushfirst!(arr, 2) + + # Intersperse again. + intersperse_with!(arr, 4) + + # Delete the first element (calls `jl_array_del_beg`). + popfirst!(arr) + + # Delete the last element (calls `jl_array_del_end`). + pop!(arr) + + # Delete some other element (calls `jl_array_del_at`). + deleteat!(arr, 8) + + result = 0 + for i in arr + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + unsafe_store!(destination, manipulate_array(), i) + return +end + +end + +function array_features_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int) * ArrayFeatures.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int}, destination_array) + + # Run the kernel. 
+ @cuda_sync threads=ArrayFeatures.thread_count ArrayFeatures.kernel(destination_pointer) + + @test download(Int, destination_array, ArrayFeatures.thread_count) == fill(ArrayFeatures.manipulate_array(), ArrayFeatures.thread_count) +end + +@cuda_benchmark "array features" array_features_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 5a12b676..97e3582c 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -3,6 +3,7 @@ using CUDAdrv, CUDAnative, Test include("utils.jl") include("array-expansion.jl") +include("array-features.jl") include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") From 69a9dd5e4b19b9f3766fecd596af34f7fa71b98d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Thu, 6 Jun 2019 15:13:34 +0200 Subject: [PATCH 127/146] Tweak a comment --- gc-benchmarks/array-features.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index 078a0f8d..62441dc2 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -2,7 +2,7 @@ module ArrayFeatures using CUDAdrv, CUDAnative -# This benchmark has every thread exercise the entire low-level +# This benchmark has every thread exercise the core low-level # array API. const thread_count = 256 From 5a939f8f1f92a2825368b7fdb9a00d6052d66667 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 7 Jun 2019 18:48:14 +0200 Subject: [PATCH 128/146] Implement jl_alloc_array_2d and jl_alloc_array_3d --- gc-benchmarks/array-features.jl | 21 ++++++++++++++++----- src/compiler/optim.jl | 9 +++++++-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index 62441dc2..d4c1dc31 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -9,6 +9,7 @@ const thread_count = 256 # Creates an array of Fibonacci numbers. 
function fib_array(count::Integer) + # Calls `jl_alloc_array_1d`. result = [1, 1] # Calls `jl_array_sizehint`. sizehint!(result, count + 2) @@ -27,6 +28,14 @@ function intersperse_with!(vec::Vector{T}, value::T) where T return vec end +function iterative_sum(array) + result = 0 + for i in array + result += i + end + return result +end + function manipulate_array() # Initialize the array as a Fibonacci sequence. arr = fib_array(20) @@ -49,11 +58,13 @@ function manipulate_array() # Delete some other element (calls `jl_array_del_at`). deleteat!(arr, 8) - result = 0 - for i in arr - result += i - end - return result + # Create a two-dimensional array (calls `jl_alloc_array_2d`). + arr_2d = fill(2, (2, 2)) + + # Create a three-dimensional array (calls `jl_alloc_array_3d`). + arr_3d = fill(2, (2, 2, 2)) + + return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) end function kernel(destination) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 69aa9174..d212d69f 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -942,6 +942,11 @@ end # Lowers function calls that pertain to array operations. 
function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false + alloc_methods = [ + :jl_alloc_array_1d, + :jl_alloc_array_2d, + :jl_alloc_array_3d + ] runtime_methods = [ :jl_array_grow_at, :jl_array_grow_beg, @@ -953,7 +958,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) ] visit_literal_pointer_calls(fun) do call, name args = collect(operands(call))[1:end - 1] - if name == :jl_alloc_array_1d + if name in alloc_methods is_ptr, array_type_ptr = to_literal_pointer(args[1]) if is_ptr # We can lower array creation calls if we know the type @@ -961,7 +966,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) array_type = unsafe_pointer_to_objref(array_type_ptr) let builder = Builder(JuliaContext()) position!(builder, call) - new_array = new_array!(builder, malloc, array_type, (args[2],)) + new_array = new_array!(builder, malloc, array_type, Tuple(args[2:end])) replace_uses!(call, new_array) unsafe_delete!(LLVM.parent(call), call) dispose(builder) From 338277234fa7b5b1f3b636583ffe0c717bace48e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 7 Jun 2019 18:56:16 +0200 Subject: [PATCH 129/146] Better document array functions --- src/device/runtime.jl | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index a1a3f4ff..dbba689a 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -410,7 +410,8 @@ end """ jl_array_grow_at_impl(a, idx, inc, n) -Grows array `a` containing `n` elements by `inc` elements at index `idx`. +Grows one-dimensional array `a` containing `n` elements by `inc` elements at +zero-based index `idx`. """ function jl_array_grow_at_impl(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t) data = a.data @@ -453,7 +454,7 @@ end """ jl_array_grow_at(a, idx, inc) -Grows array `a` by `inc` elements at index `idx`. +Grows one-dimensional array `a` by `inc` elements at zero-based index `idx`. 
""" function jl_array_grow_at(a::Array1D, idx::Cssize_t, inc::Csize_t) jl_array_grow_at_impl(a, Csize_t(idx), inc, a.nrows) @@ -470,7 +471,7 @@ compile( """ jl_array_grow_end(a, inc) -Grows array `a` by `inc` elements at the end. +Grows one-dimensional array `a` by `inc` elements at the end. """ function jl_array_grow_end(a::Array1D, inc::Csize_t) n = a.nrows @@ -488,7 +489,7 @@ compile( """ jl_array_grow_beg(a, inc) -Grows array `a` by `inc` elements at the beginning of the array. +Grows one-dimensional array `a` by `inc` elements at the beginning of the array. """ function jl_array_grow_beg(a::Array1D, inc::Csize_t) jl_array_grow_at_impl(a, Csize_t(0), inc, a.nrows) @@ -505,7 +506,7 @@ compile( """ jl_array_sizehint(a, sz) -Suggest that collection `a` reserve capacity for at least `sz` elements. +Suggest that one-dimensional array `a` reserve capacity for at least `sz` elements. """ function jl_array_sizehint(a::Array1D, sz::Csize_t) n = a.length @@ -533,7 +534,8 @@ compile( """ jl_array_del_at_impl(a, idx, dec, n) -Removes a range of elements from array `a`. +Removes `dec` elements from one-dimensional array `a`, starting at zero-based index `idx`. +`n` is the number of elements in `a`. """ function jl_array_del_at_impl(a::Array1D, idx::Csize_t, dec::Csize_t, n::Csize_t) data = a.data @@ -551,6 +553,11 @@ function jl_array_del_at_impl(a::Array1D, idx::Csize_t, dec::Csize_t, n::Csize_t return end +""" + jl_array_del_beg(a, dec) + +Removes `dec` elements from the beginning of one-dimensional array `a`. +""" function jl_array_del_beg(a::Array1D, dec::Csize_t) jl_array_del_at_impl(a, Csize_t(0), dec, a.nrows) return @@ -563,6 +570,11 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) +""" + jl_array_del_end(a, dec) + +Removes `dec` elements from the end of one-dimensional array `a`. 
+""" function jl_array_del_end(a::Array1D, dec::Csize_t) n = a.nrows jl_array_del_at_impl(a, n, dec, n) @@ -576,6 +588,12 @@ compile( () -> convert(LLVMType, Cvoid), () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_del_at(a, idx, dec) + +Removes `dec` elements from one-dimensional array `a`, starting at zero-based index `idx`. +""" function jl_array_del_at(a::Array1D, idx::Cssize_t, dec::Csize_t) jl_array_del_at_impl(a, Csize_t(idx), dec, a.nrows) return From 194265986811c8fa966091b553dc1722c99c3ae2 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 10 Jun 2019 12:47:32 +0200 Subject: [PATCH 130/146] Implement jl_new_array --- gc-benchmarks/array-features.jl | 5 +++- src/compiler/optim.jl | 47 +++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index d4c1dc31..317e4aab 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -64,7 +64,10 @@ function manipulate_array() # Create a three-dimensional array (calls `jl_alloc_array_3d`). arr_3d = fill(2, (2, 2, 2)) - return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) + # Create a four-dimensional array (calls `jl_new_array`). + arr_4d = fill(2, (2, 2, 2, 2)) + + return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) + iterative_sum(arr_4d) end function kernel(destination) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index d212d69f..b58a2ec7 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -939,13 +939,49 @@ function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple return obj_ptr end +# Generates code that extracts array dimensions from a tuple argument. +function extract_array_dims!(builder, ::Type{Array{T, N}}, dims_tuple) where {T, N} + # First cast the tuple value to a size_t pointer in address space zero. 
+ tuple_as_size_t = bitcast!( + builder, + addrspacecast!( + builder, + dims_tuple, + LLVM.PointerType(eltype(llvmtype(dims_tuple)))), + LLVM.PointerType(convert(LLVMType, Csize_t))) + + is_literal, ptr = to_literal_pointer(tuple_as_size_t) + + results = [] + if is_literal + # If the tuple is implemented as a literal pointer, then we want to load its elements + # ahead of time; the device won't be able to access host-allocated constants. + for i in 1:N + value = Base.unsafe_load(Base.unsafe_convert(Ptr{Csize_t}, ptr), i) + push!(results, LLVM.ConstantInt(convert(LLVMType, Csize_t), value)) + end + else + # Otherwise, generate code that loads fields from the tuple. + for i in 1:N + address = gep!( + builder, + tuple_as_size_t, + [LLVM.ConstantInt(convert(LLVMType, Int32), i)]) + + push!(results, load!(builder, address)) + end + end + return Tuple(results) +end + # Lowers function calls that pertain to array operations. function lower_array_calls!(fun::LLVM.Function, malloc) changed_any = false alloc_methods = [ :jl_alloc_array_1d, :jl_alloc_array_2d, - :jl_alloc_array_3d + :jl_alloc_array_3d, + :jl_new_array ] runtime_methods = [ :jl_array_grow_at, @@ -966,7 +1002,14 @@ function lower_array_calls!(fun::LLVM.Function, malloc) array_type = unsafe_pointer_to_objref(array_type_ptr) let builder = Builder(JuliaContext()) position!(builder, call) - new_array = new_array!(builder, malloc, array_type, Tuple(args[2:end])) + if name == :jl_new_array + # jl_new_array requires special treatment. All the other ones are + # pretty simple to handle. 
+ dim_args = extract_array_dims!(builder, array_type, args[2]) + else + dim_args = Tuple(args[2:end]) + end + new_array = new_array!(builder, malloc, array_type, dim_args) replace_uses!(call, new_array) unsafe_delete!(LLVM.parent(call), call) dispose(builder) From 8612466ead3455c5aa60db6acd4a4731339f8d73 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 10 Jun 2019 14:09:12 +0200 Subject: [PATCH 131/146] Implement jl_ptr_to_array{,_1d} --- gc-benchmarks/array-features.jl | 14 ++++++++++++- src/compiler/optim.jl | 35 +++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index 317e4aab..9f8cde52 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -67,7 +67,19 @@ function manipulate_array() # Create a four-dimensional array (calls `jl_new_array`). arr_4d = fill(2, (2, 2, 2, 2)) - return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) + iterative_sum(arr_4d) + # Create an alias for the Fibonacci array (this is dangerous, but we + # know what we're doing here; calls `jl_ptr_to_array_1d`). + alias = unsafe_wrap(Array, pointer(arr), length(arr)) + + # Create an alias for `arr_2d` (calls `jl_ptr_to_array`). + alias_2d = unsafe_wrap(Array, pointer(arr_2d), size(arr_2d)) + + return iterative_sum(arr) + + iterative_sum(arr_2d) + + iterative_sum(arr_3d) + + iterative_sum(arr_4d) + + iterative_sum(alias) + + iterative_sum(alias_2d) end function kernel(destination) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index b58a2ec7..bacaac0d 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -775,7 +775,7 @@ end # Emits instructions that create a new array. The array's element type # must be statically known. Its dimensions are represented as a tuple # of LLVM IR values. A pointer to the new array is returned. 
-function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple) +function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple; data_ptr::Union{Nothing,LLVM.Value} = nothing) # Since time immemorial, the structure of an array is (quoting from the # Julia source code here): # @@ -859,7 +859,9 @@ function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple # Actually allocate the array's contents. We will just always # use a separate buffer. Inline data storage is wasteful and # harder to implement. - data_ptr = new_bytes!(builder, malloc, data_bytesize) + if data_ptr == nothing + data_ptr = new_bytes!(builder, malloc, data_bytesize) + end # The pointer to the array's data is the first field of the struct. push!(fields, data_ptr) @@ -966,7 +968,7 @@ function extract_array_dims!(builder, ::Type{Array{T, N}}, dims_tuple) where {T, address = gep!( builder, tuple_as_size_t, - [LLVM.ConstantInt(convert(LLVMType, Int32), i)]) + [LLVM.ConstantInt(convert(LLVMType, Int32), i - 1)]) push!(results, load!(builder, address)) end @@ -983,6 +985,10 @@ function lower_array_calls!(fun::LLVM.Function, malloc) :jl_alloc_array_3d, :jl_new_array ] + wrap_methods = [ + :jl_ptr_to_array, + :jl_ptr_to_array_1d + ] runtime_methods = [ :jl_array_grow_at, :jl_array_grow_beg, @@ -1014,8 +1020,28 @@ function lower_array_calls!(fun::LLVM.Function, malloc) unsafe_delete!(LLVM.parent(call), call) dispose(builder) end + changed_any = true + end + elseif name in wrap_methods + is_ptr, array_type_ptr = to_literal_pointer(args[1]) + if is_ptr + # We can lower array wrapping calls if we know the type + # of the array to create in advance. 
+ array_type = unsafe_pointer_to_objref(array_type_ptr) + let builder = Builder(JuliaContext()) + position!(builder, call) + if name == :jl_ptr_to_array + dim_args = extract_array_dims!(builder, array_type, args[3]) + else + dim_args = (args[3],) + end + new_array = new_array!(builder, malloc, array_type, dim_args; data_ptr=args[2]) + replace_uses!(call, new_array) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed_any = true end - changed_any = true elseif name in runtime_methods let builder = Builder(JuliaContext()) position!(builder, call) @@ -1024,6 +1050,7 @@ function lower_array_calls!(fun::LLVM.Function, malloc) unsafe_delete!(LLVM.parent(call), call) dispose(builder) end + changed_any = true end end return changed_any From 952a645cda8c532e6fa453804e0fe36ba6d16c42 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 16:59:19 +0200 Subject: [PATCH 132/146] Compare GC strategies when running benchmarks --- gc-benchmarks/run-all.jl | 16 +++++++++++++- gc-benchmarks/utils.jl | 47 ++++++++++++++++++++++++++++++++-------- 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 97e3582c..46449f85 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -19,8 +19,10 @@ results = run_benchmarks() # Print the results to the terminal. println(results) +gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] + # Also write them to a CSV for further analysis. 
-open("results.csv", "w") do file +open("strategies.csv", "w") do file write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") for key in sort([k for k in keys(results)]) runs = results[key] @@ -35,3 +37,15 @@ open("results.csv", "w") do file write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,$bump_time,1,$gc_ratio,$gc_shared_ratio,$bump_ratio\n") end end + +open("gc-heap-sizes.csv", "w") do file + ratio_tags = [t * "-ratio" for t in gc_tags] + write(file, "benchmark,$(join(gc_tags, ',')),$(join(ratio_tags, ','))\n") + for key in sort([k for k in keys(results)]) + runs = results[key] + median_times = BenchmarkTools.median(runs) + times = [median_times[t].time / 1e6 for t in gc_tags] + normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + write(file, "$key,$(join(times, ',')),$(join(normalized_times, ','))\n") + end +end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index eca5b0cf..822954a3 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -67,19 +67,48 @@ end const MiB = 1 << 20 +benchmark_tags = [ + "gc", "gc-shared", + "gc-30mb", "gc-shared-30mb", + "gc-15mb", "gc-shared-15mb", + "gc-7.5mb", "gc-shared-7.5mb", + "gc-3.75mb", "gc-shared-3.75mb", + "nogc", "bump" +] + macro cuda_benchmark(name, ex) esc(quote - suite[$name] = BenchmarkTools.BenchmarkGroup(["gc", "gc-shared", "nogc", "bump"]) - register_cuda_benchmark($name, "gc") do - global gc_mode = "gc" - global gc_config = GCConfiguration(local_arena_count=8, local_arena_initial_size=MiB, global_arena_initial_size=2 * MiB) - $(ex) + local function register_gc(config, heap_size) + register_cuda_benchmark($name, config) do + global gc_mode = "gc" + global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=heap_size) + $(ex) + end end - register_cuda_benchmark($name, "gc-shared") do - global gc_mode = "gc" - global gc_config = GCConfiguration(local_arena_count=0, 
global_arena_initial_size=10 * MiB)
-        $(ex)
+    local function register_gc_shared(config, heap_size)
+        register_cuda_benchmark($name, config) do
+            global gc_mode = "gc"
+            local local_arena_initial_size = div(heap_size, 10)
+            local global_arena_initial_size = heap_size - 8 * local_arena_initial_size
+            global gc_config = GCConfiguration(
+                local_arena_count=8,
+                local_arena_initial_size=local_arena_initial_size,
+                global_arena_initial_size=global_arena_initial_size)
+            $(ex)
+        end
     end
+
+    suite[$name] = BenchmarkTools.BenchmarkGroup(benchmark_tags)
+    register_gc("gc", 60 * MiB)
+    register_gc_shared("gc-shared", 60 * MiB)
+    register_gc("gc-30mb", 30 * MiB)
+    register_gc_shared("gc-shared-30mb", 30 * MiB)
+    register_gc("gc-15mb", 15 * MiB)
+    register_gc_shared("gc-shared-15mb", 15 * MiB)
+    register_gc("gc-7.5mb", div(15 * MiB, 2))
+    register_gc_shared("gc-shared-7.5mb", div(15 * MiB, 2))
+    register_gc("gc-3.75mb", div(15 * MiB, 4))
+    register_gc_shared("gc-shared-3.75mb", div(15 * MiB, 4))
     register_cuda_benchmark($name, "nogc") do
         global gc_mode = "nogc"
         $(ex)

From be276cf62b9a9e8bcbd02c319a63cf7ee1bb96ce Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Tue, 11 Jun 2019 17:14:57 +0200
Subject: [PATCH 133/146] Tweak array-features benchmark

---
 gc-benchmarks/array-features.jl | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl
index 9f8cde52..c27a876a 100644
--- a/gc-benchmarks/array-features.jl
+++ b/gc-benchmarks/array-features.jl
@@ -74,17 +74,26 @@ function manipulate_array()
     # Create an alias for `arr_2d` (calls `jl_ptr_to_array`).
     alias_2d = unsafe_wrap(Array, pointer(arr_2d), size(arr_2d))
 
+    # Create an array that is similar to `arr_3d` and fill it with constants.
+    # This does not call any new low-level functions, but it does illustrate
+    # that high-level functions such as `similar` and `fill!` are fully functional.
+ arr_3d_sim = similar(arr_3d) + fill!(arr_3d_sim, 10) + return iterative_sum(arr) + iterative_sum(arr_2d) + iterative_sum(arr_3d) + iterative_sum(arr_4d) + iterative_sum(alias) + - iterative_sum(alias_2d) + iterative_sum(alias_2d) + + iterative_sum(arr_3d_sim) end function kernel(destination) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - unsafe_store!(destination, manipulate_array(), i) + for j in 1:3 + unsafe_store!(destination, manipulate_array(), i) + end return end From 85766d526ed761cad5631c3ca3277cc4cad442c4 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 17:32:29 +0200 Subject: [PATCH 134/146] Update optim.jl to use stock Julia --- src/compiler/optim.jl | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index bacaac0d..976b700a 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -19,7 +19,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) # # NOTE: we need to use multiple distinct pass managers to force pass ordering; # intrinsics should never get lowered before Julia has optimized them. - if VERSION < v"1.2.0-DEV.375" + if VERSION < v"1.3.0-DEV.390" # with older versions of Julia, intrinsics are lowered unconditionally so we need to # replace them with GPU-compatible counterparts before anything else. 
that breaks # certain optimizations though: https://github.com/JuliaGPU/CUDAnative.jl/issues/340 @@ -44,7 +44,8 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) initialize!(pm) ccall(:jl_add_optimization_passes, Cvoid, (LLVM.API.LLVMPassManagerRef, Cint, Cint), - LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 1) + LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0) + ccall(:LLVMExtraAddLateLowerGCFramePass, Cvoid, (LLVM.API.LLVMPassManagerRef,), LLVM.ref(pm)) run!(pm, mod) end @@ -62,11 +63,6 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) - # the Julia GC lowering pass also has some clean-up that is required - if VERSION >= v"1.2.0-DEV.531" - late_lower_gc_frame!(pm) - end - run!(pm, mod) end replace_malloc!(mod, job.malloc) From 2e640f5361a7de0a81a51656856dd6b936675b35 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 17:34:47 +0200 Subject: [PATCH 135/146] Fix misnomer in utils.jl --- gc-benchmarks/utils.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 822954a3..6ceca63d 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -78,14 +78,14 @@ benchmark_tags = [ macro cuda_benchmark(name, ex) esc(quote - local function register_gc(config, heap_size) + local function register_gc_shared(config, heap_size) register_cuda_benchmark($name, config) do global gc_mode = "gc" global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=heap_size) $(ex) end end - local function register_gc_shared(config, heap_size) + local function register_gc(config, heap_size) register_cuda_benchmark($name, config) do global gc_mode = "gc" local local_arena_initial_size = div(heap_size, 10) From bb7b44026e3bc5dcdab07f290309b5739cfee951 Mon Sep 17 00:00:00 2001 From: jonathanvdc 
Date: Tue, 11 Jun 2019 17:43:09 +0200 Subject: [PATCH 136/146] Include mean in gc-heap-sizes.csv --- gc-benchmarks/run-all.jl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 46449f85..eb17da9b 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -1,4 +1,4 @@ -using CUDAdrv, CUDAnative, Test +using CUDAdrv, CUDAnative, Test, Statistics include("utils.jl") @@ -41,11 +41,20 @@ end open("gc-heap-sizes.csv", "w") do file ratio_tags = [t * "-ratio" for t in gc_tags] write(file, "benchmark,$(join(gc_tags, ',')),$(join(ratio_tags, ','))\n") + all_times = [[] for t in gc_tags] + all_normalized_times = [[] for t in gc_tags] for key in sort([k for k in keys(results)]) runs = results[key] median_times = BenchmarkTools.median(runs) times = [median_times[t].time / 1e6 for t in gc_tags] + for (l, val) in zip(all_times, times) + push!(l, val) + end normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + for (l, val) in zip(all_normalized_times, normalized_times) + push!(l, val) + end write(file, "$key,$(join(times, ',')),$(join(normalized_times, ','))\n") end + write(file, "mean,$(join(map(mean, all_times), ',')),$(join(map(mean, all_normalized_times), ','))\n") end From be1692c14de80f246ad0a9bbb06008a382b23153 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 17:48:44 +0200 Subject: [PATCH 137/146] Remove experimental allocator implementations --- src/gc.jl | 627 ------------------------------------------------------ 1 file changed, 627 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index f329c8e9..1fce27c4 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -107,42 +107,6 @@ end # Gets a free list arena's lock. get_lock(arena::Ptr{FreeListArena}) = ReaderWriterLock(@get_field_pointer(arena, :lock_state)) -# A data structure that describes a ScatterAlloc superblock. Every -# superblock is prefixed by one of these. 
-struct ScatterAllocSuperblock - # The number of regions in the superblock. - region_count::UInt32 - - # The number of pages in a region managed by this superblock. - pages_per_region::UInt32 - - # The size of a page in the superblock, in bytes. This size - # does not include the page's header. - page_size::UInt32 - - # A pointer to the next superblock. - next::Ptr{ScatterAllocSuperblock} -end - -# A region in a ScatterAlloc superblock. -struct ScatterAllocRegion - # The number of pages in this region that are full. - full_page_count::Int64 -end - -# A page in a ScatterAlloc region. -struct ScatterAllocPage - # The size of a chunk in this page. - chunk_size::Int64 - - # The number of allocated blocks in this page. - allocated_chunk_count::Int64 - - # A bitmask that describes which chunks have been allocated - # and which chunks are still free. - occupancy::Int64 -end - const gc_align = Csize_t(16) # Aligns a pointer to an alignment boundary. @@ -170,17 +134,6 @@ function align_upward(offset::T, alignment::Csize_t = gc_align)::T where T <: In convert(T, Csize_t(align_upward(convert(Ptr{UInt8}, Csize_t(offset)), alignment))) end -# Gets the page size in a superblock. This size does not include -# the page header. -function page_size(superblock::Ptr{ScatterAllocSuperblock}) - unsafe_load(@get_field_pointer(superblock, :page_size)) -end - -# Gets the number of pages per region in a superblock. -function pages_per_region(superblock::Ptr{ScatterAllocSuperblock}) - unsafe_load(@get_field_pointer(superblock, :pages_per_region)) -end - # Gets the size of an aligned header, including padding to satisfy # alignment requirements. @generated function header_size(::Type{T}, ::Val{alignment} = Val(gc_align))::UInt32 where {T, alignment} @@ -188,107 +141,6 @@ end :($result) end -# Gets the total number of chunks in a particular page. 
-function chunk_count(page::Ptr{ScatterAllocPage}, superblock::Ptr{ScatterAllocSuperblock}) - chunk_size = unsafe_load(@get_field_pointer(page, :chunk_size)) - div(page_size(superblock), chunk_size) -end - -# Gets the address of a particular chunk in a page. `index` is zero-based. -function chunk_address(page::Ptr{ScatterAllocPage}, index::Integer)::Ptr{UInt8} - chunk_size = unsafe_load(@get_field_pointer(page, :chunk_size)) - Base.unsafe_convert(Ptr{UInt8}, page + header_size(ScatterAllocPage) + chunk_size * index) -end - -# Gets the address of a particular page in a region. `index` is zero-based. -function page_address(region::Ptr{ScatterAllocRegion}, superblock::Ptr{ScatterAllocSuperblock}, index::Integer)::Ptr{ScatterAllocPage} - Base.unsafe_convert( - Ptr{ScatterAllocPage}, - region + header_size(ScatterAllocRegion) + index * (header_size(ScatterAllocPage) + page_size(superblock))) -end - -# Gets the total size in bytes of a region, including overhead. -function region_bytesize(pages_per_region::Integer, page_size::Integer) - region_data_size = pages_per_region * (header_size(ScatterAllocPage) + page_size) - header_size(ScatterAllocRegion) + region_data_size -end - -# Gets the address of a particular region in a superblock. `index` is zero-based. -function region_address(superblock::Ptr{ScatterAllocSuperblock}, index::Integer)::Ptr{ScatterAllocRegion} - Base.unsafe_convert( - Ptr{ScatterAllocPage}, - superblock + header_size(ScatterAllocSuperblock) + index * region_bytesize(pages_per_region(superblock), page_size(superblock))) -end - -# A GC arena that uses the ScatterAlloc algorithm for allocations. -struct ScatterAllocArena - # A pointer to the first superblock managed by this arena. - first_superblock::Ptr{ScatterAllocSuperblock} -end - -# A "shelf" in a bodega arena. See `BodegaArena` for more info on -# how shelves work. -struct BodegaShelf - # The size of the chunks on this shelf. - chunk_size::Csize_t - - # The maximal number of chunks on this shelf. 
- capacity::Int64 - - # An index into the shelf that points to the first free - # chunk. This is a zero-based index. - chunk_finger::Int64 - - # A pointer to an array of pointers to chunks of memory. - # Every chunk in this array has a chunk size that is - # at least as large as `chunk_size`. - chunks::Ptr{Ptr{UInt8}} -end - -# A GC arena that uses a custom ("bodega") allocation algorithm for allocations. -# Essentially, this type of arena has a list of "shelves" that contain small, -# preallocated chunks of memory that threads can claim in a fast and lock-free -# manner. When the shelves run out of memory, threads may re-stock them from free -# list, amortizing the cost of lock acquisition across many different allocations. -struct BodegaArena - # The number of shelves in the arena. - shelf_count::Int - - # A pointer to an array of shelves. - shelves::Ptr{BodegaShelf} - - # A Boolean that tells if it is sensible to try and restock shelves in this - # arena. Restocking shelves becomes futile once the free list's capacity is - # exhausted. - can_restock::Bool - - # The free list this bodega uses for large allocations and for re-stocking - # the shelves. - free_list::FreeListArena -end - -# Gets a pointer to a bodega arena's free list. -function get_free_list(arena::Ptr{BodegaArena})::Ptr{FreeListArena} - @get_field_pointer(arena, :free_list) -end - -# Gets a bodega arena's lock. -get_lock(arena::Ptr{BodegaArena}) = get_lock(get_free_list(arena)) - -# Gets the first shelf containing chunks that are at least `bytesize` bytes -# in size. Returns null if there is no such shelf. -function get_shelf(arena::Ptr{BodegaArena}, bytesize::Csize_t)::Ptr{BodegaShelf} - bodega = unsafe_load(arena) - for i in 1:bodega.shelf_count - shelf = bodega.shelves + (i - 1) * sizeof(BodegaShelf) - chunk_size = unsafe_load(@get_field_pointer(shelf, :chunk_size)) - if chunk_size >= bytesize - return shelf - end - end - return C_NULL -end - # A reference to a Julia object. 
const ObjectRef = Ptr{Nothing} @@ -329,10 +181,6 @@ struct GCMasterRecord # The number of local arenas. local_arena_count::UInt32 - # A pointer to the tiny arena, which uses the ScatterAlloc - # algorithm to provision space for small objects. - tiny_arena::Ptr{ScatterAllocArena} - # A pointer to a list of local GC arena pointers. local_arenas::Ptr{Ptr{LocalArena}} @@ -569,181 +417,6 @@ function gc_add_to_free_list( unsafe_store!(list_ptr, entry) end -# Tries to allocate a chunk of memory from a ScatterAlloc page. -# Returns a null pointer if no chunk of memory can be found. -function gc_scatter_alloc_use_page( - page::Ptr{ScatterAllocPage}, - region::Ptr{ScatterAllocRegion}, - superblock::Ptr{ScatterAllocSuperblock})::Ptr{UInt8} - - alloc_chunk_ptr = @get_field_pointer(page, :allocated_chunk_count) - fill_level = atomic_add!(alloc_chunk_ptr, 1) - spots = chunk_count(page, superblock) - if fill_level < spots - if fill_level + 1 == spots - # The page is full now. Increment the region's counter. - full_page_ptr = @get_field_pointer(region, :full_page_count) - atomic_add!(full_page_ptr, 1) - end - - lane_id = (get_thread_id() - 1) % warpsize() - spot = lane_id % spots - occupancy_ptr = @get_field_pointer(page, :occupancy) - while true - # Check if our preferred spot is available. - mask = 1 << spot - old = atomic_or!(occupancy_ptr, mask) - - actual_fill = 0 - for i in 1:64 - if old & (1 << (i - 1)) != 0 - actual_fill += 1 - end - end - - # If the spot is available, then use it. - if old & mask == 0 - break - end - - # Otherwise, find a new spot. - spot = (spot + 1) % spots - end - return chunk_address(page, spot) - end - - # The page is full. 
- atomic_subtract!(alloc_chunk_ptr, 1) - return C_NULL -end - -function scatter_alloc_hash( - superblock::Ptr{ScatterAllocSuperblock}, - bytesize::Int64)::Int64 - - sb = unsafe_load(superblock) - page_count = sb.region_count * sb.pages_per_region - warp_id = get_warp_id() - 1 - - k_S = 38183 - k_mp = 17497 - - (bytesize * k_S + warp_id * k_mp) % page_count -end - -# Tries to allocate a chunk of memory from a ScatterAlloc superblock. -# Returns a null pointer if no sufficiently large chunk of -# memory can be found. -function gc_scatter_alloc_use_superblock( - superblock::Ptr{ScatterAllocSuperblock}, - bytesize::Csize_t)::Ptr{UInt8} - - if bytesize > page_size(superblock) - # This isn't going to work. The superblock's page size is just too small. - return C_NULL - end - - # Choose the allocation size in such a way that we never end up with more than - # 64 chunks. This is necessary because the chunk occupancy bitfield is only - # 64 bits wide. - alloc_size = Int64(div(page_size(superblock), 64)) - if alloc_size < Int64(bytesize) - alloc_size = Int64(bytesize) - end - - # Align the allocation size. - alloc_size = align_upward(alloc_size) - - # We are looking for a chunk that is `bytesize` bytes in size, - # but we're willing to accept a chunk that is twice as large. - waste_factor = 2 - max_size = alloc_size * waste_factor - - pages_per_region = unsafe_load(@get_field_pointer(superblock, :pages_per_region)) - region_count = unsafe_load(@get_field_pointer(superblock, :region_count)) - - # Guess a global page index. - global_page_id = scatter_alloc_hash(superblock, alloc_size) - - # Decompose that global page index into a region index and a - # local page index. - region_id = global_page_id % pages_per_region - page_id = div(global_page_id, pages_per_region) - - # Remember the initial values of the region and page ids. - init_region_id = region_id - init_page_id = page_id - - # Find the region and page corresponding to the current page ID. 
- region = region_address(superblock, region_id) - while true - page = page_address(region, superblock, page_id) - - # Skip regions until we find a region that is sufficiently empty. - while true - region_fill_level = unsafe_load(region).full_page_count / pages_per_region - if region_fill_level > 0.9 - region_id += 1 - if region_id >= region_count - region_id = 0 - end - region = region_address(superblock, region_id) - page_id = 0 - else - break - end - end - - # Try to set the chunk size to our preferred chunk size. - chunk_size_ptr = @get_field_pointer(page, :chunk_size) - chunk_size = atomic_compare_exchange!(chunk_size_ptr, 0, alloc_size) - if chunk_size == 0 || (chunk_size >= alloc_size && chunk_size <= max_size) - # If we managed to set the page's chunk size, then the page is definitely - # suitable for our purposes. Otherwise, the page might still be suitable - # if its chunk size is sufficiently large to accommodate the requested - # size yet small enough to not waste too much space. - result = gc_scatter_alloc_use_page(page, region, superblock) - if result != C_NULL - return result - end - end - - # Try the next page. - page_id += 1 - - if page_id >= pages_per_region - region_id += 1 - if region_id >= region_count - region_id = 0 - end - region = region_address(superblock, region_id) - page_id = 0 - end - - # We tried every page in the entire superblock and found nothing. - if region_id == init_region_id && page_id == init_page_id - return C_NULL - end - end -end - -# Tries to allocate a chunk of memory in a particular GC arena. -# Returns a null pointer if no sufficiently large chunk of -# memory can be found. -function gc_malloc_local(arena::Ptr{ScatterAllocArena}, bytesize::Csize_t)::Ptr{UInt8} - # Walk the list of superblocks until we find a valid candidate. 
- superblock = unsafe_load(arena).first_superblock - while superblock != C_NULL - result = gc_scatter_alloc_use_superblock(superblock, bytesize) - if result != C_NULL - return result - end - superblock = unsafe_load(@get_field_pointer(superblock, :next)) - end - - return C_NULL -end - # Tries to allocate a chunk of memory from a free list. # Returns a null pointer if no sufficiently large chunk of # memory can be found. @@ -823,101 +496,6 @@ function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t; acquire_l return result_ptr end -# Atomically takes a chunk from a shelf. Returns null if the shelf -# is empty. -function gc_malloc_from_shelf(shelf::Ptr{BodegaShelf})::Ptr{UInt8} - capacity = unsafe_load(@get_field_pointer(shelf, :capacity)) - - # Atomically increment the chunk finger. - finger_ptr = @get_field_pointer(shelf, :chunk_finger) - finger = atomic_add!(finger_ptr, 1) - - if finger < capacity - # If the chunk finger was less than the capacity, then we actually - # managed to take a chunk from the shelf. We only need to retrieve - # its address. - chunk_array = unsafe_load(@get_field_pointer(shelf, :chunks)) - return unsafe_load(chunk_array, finger + 1) - else - # Otherwise, we've got nothing. Return null. - return C_NULL - end -end - -# Re-stocks a shelf. -function restock_shelf(arena::Ptr{BodegaArena}, shelf::Ptr{BodegaShelf}) - shelf_size = unsafe_load(@get_field_pointer(shelf, :chunk_size)) - capacity = unsafe_load(@get_field_pointer(shelf, :capacity)) - finger_ptr = @get_field_pointer(shelf, :chunk_finger) - finger = unsafe_load(finger_ptr) - - # The finger may exceed the capacity. This is harmless. Just - # reset the finger to the capacity. - if finger > capacity - finger = capacity - end - - # Actually re-stock the shelf. 
- free_list = get_free_list(arena) - chunk_array = unsafe_load(@get_field_pointer(shelf, :chunks)) - while finger > 0 - chunk = gc_malloc_from_free_list(free_list, shelf_size) - if chunk == C_NULL - # We exhausted the free list. Better break now. Also set - # the arena's `can_restock` flag to false so there will be - # no future attempts to re-stock shelves. - unsafe_store!(@get_field_pointer(arena, :can_restock), false) - break - end - - # Update the chunk array. - unsafe_store!(chunk_array, chunk, finger) - finger -= 1 - end - - # Update the finger. - unsafe_store!(finger_ptr, finger) -end - -# Tries to allocate a chunk of memory in a particular GC arena. -# Returns a null pointer if no sufficiently large chunk of -# memory can be found. -function gc_malloc_local(arena::Ptr{BodegaArena}, bytesize::Csize_t; acquire_lock=true)::Ptr{UInt8} - # The bodega arena might be empty (or approximately empty). If so, then we'll - # just return null early. There's no need to scrape the bottom of the barrel. - if !unsafe_load(@get_field_pointer(arena, :can_restock)) - return C_NULL - end - - # Find the right shelf for this allocation. - shelf = get_shelf(arena, bytesize) - free_list = get_free_list(arena) - if shelf == C_NULL - # The shelves' chunk sizes are all too small to accommodate this - # allocation. Use the free list directly. - return gc_malloc_local(free_list, bytesize) - end - - # Acquire a reader lock on the arena and try to take a chunk - # from the shelf. - lock = get_lock(free_list) - result_ptr = reader_locked(lock; acquire_lock=acquire_lock) do - gc_malloc_from_shelf(shelf) - end - - if result_ptr == C_NULL - # Looks like we need to re-stock the shelf. While we're at it, - # we might as well grab a chunk of memory for ourselves. 
- result_ptr = writer_locked(lock; acquire_lock=acquire_lock) do - restock_shelf(arena, shelf) - gc_malloc_from_free_list(free_list, bytesize) - end - end - - gc_protect(result_ptr) - return result_ptr -end - # Transfers a block of free memory from one arena to another and then # allocates a differently-sized block of memory from the destination # arena. @@ -944,28 +522,6 @@ function gc_transfer_and_malloc( end end -# Transfers a block of free memory from one arena to another and then -# allocates a differently-sized block of memory from the destination -# arena. -function gc_transfer_and_malloc( - from_arena::Ptr{FreeListArena}, - to_arena::Ptr{BodegaArena}, - transfer_bytesize::Csize_t, - alloc_bytesize::Csize_t)::Ptr{UInt8} - - result = gc_transfer_and_malloc( - from_arena, - get_free_list(to_arena), - transfer_bytesize, - alloc_bytesize) - - writer_locked(get_lock(to_arena)) do - unsafe_store!(@get_field_pointer(to_arena, :can_restock), true) - end - - return result -end - """ gc_malloc(bytesize::Csize_t)::Ptr{UInt8} @@ -976,16 +532,6 @@ function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} master_record = get_gc_master_record() function allocate() - # Try to allocate in the tiny arena first. The ScatterAlloc - # algorithm used by that arena is lock-free and works well - # for small objects. - if master_record.tiny_arena != C_NULL - local_ptr = gc_malloc_local(master_record.tiny_arena, bytesize) - if local_ptr != C_NULL - return local_ptr - end - end - # Try to allocate in the local arena second. If that doesn't # work, we'll move on to the global arena, which is bigger but # is shared by all threads. (We want to minimize contention @@ -1126,14 +672,6 @@ end # One megabyte. const MiB = 1 << 20 -# The point at which a tiny arena is deemed to be starving, i.e., -# it no longer contains enough memory to perform basic allocations. 
-# If a tiny arena's free byte count stays below the arena starvation -# threshold after a collection phase, the collector will allocate -# additional memory to the arena such that it is no longer starving. -# This arena starvation threshold is currently set to 2 MiB. -const tiny_arena_starvation_threshold = 0 # 2 * MiB - # A description of a region of memory that has been allocated to the GC heap. const GCHeapRegion = CUDAdrv.Mem.HostBuffer @@ -1244,14 +782,6 @@ function gc_init!( # Compute a pointer to the start of the tiny arena. arena_start_ptr = rootbuf_ptr + rootbuf_bytesize - # Set up the tiny object arena. - if tiny_arena_starvation_threshold > 0 - arena_for_ants = make_gc_arena!(ScatterAllocArena, arena_start_ptr, Csize_t(tiny_arena_starvation_threshold)) - arena_start_ptr += tiny_arena_starvation_threshold - else - arena_for_ants = Base.unsafe_convert(Ptr{ScatterAllocArena}, C_NULL) - end - # Set up local arenas. for i in 1:config.local_arena_count local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(config.local_arena_initial_size)) @@ -1267,7 +797,6 @@ function gc_init!( UInt32(thread_count), UInt32(config.root_buffer_capacity), UInt32(config.local_arena_count), - arena_for_ants, local_arenas_ptr, global_arena, safepoint_ptr, @@ -1302,89 +831,6 @@ function make_gc_arena!(::Type{FreeListArena}, start_ptr::Ptr{T}, size::Csize_t) arena end -# Takes a zero-filled region of memory and turns it into an arena -# managed by the GC, prefixed with an arena record. 
-function make_gc_arena!(::Type{BodegaArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{BodegaArena} where T - current_ptr = start_ptr + sizeof(BodegaArena) - - # Set up some shelf chunk arrays - shelf_records = [] - for chunk_size in [32, 64] - capacity = 2048 - shelf_chunk_array = Base.unsafe_convert(Ptr{Ptr{UInt8}}, current_ptr) - current_ptr += capacity * sizeof(Ptr{UInt8}) - push!(shelf_records, BodegaShelf(Csize_t(chunk_size), capacity, capacity, shelf_chunk_array)) - end - - # Set up the shelves. - shelf_array = Base.unsafe_convert(Ptr{BodegaShelf}, current_ptr) - for record in shelf_records - shelf = Base.unsafe_convert(Ptr{BodegaShelf}, current_ptr) - current_ptr += sizeof(BodegaShelf) - unsafe_store!(shelf, record) - end - - # Set up a free list entry. - first_entry_ptr = make_gc_block!(current_ptr, Csize_t(start_ptr + size) - Csize_t(current_ptr)) - - # Set up the arena record. - arena = Base.unsafe_convert(Ptr{BodegaArena}, start_ptr) - unsafe_store!( - arena, - BodegaArena( - length(shelf_records), - shelf_array, - true, - FreeListArena(0, first_entry_ptr, C_NULL))) - - # Stock the shelves. - for record in shelf_records - restock_shelf(arena, get_shelf(arena, record.chunk_size)) - end - - arena -end - -# Takes a zero-filled region of memory and turns it into a ScatterAlloc -# superblock. -function make_gc_superblock!( - start_ptr::Ptr{T}, - size::Csize_t; - page_size::UInt32 = UInt32(2048), - pages_per_region::UInt32 = UInt32(16))::Ptr{ScatterAllocSuperblock} where T - - region_size = region_bytesize(pages_per_region, page_size) - - # Figure out how many regions we can allocate. - region_count = div(size - header_size(ScatterAllocSuperblock), region_size) - - # At this point, we'd normally allocate regions and pages. - # However, region and page headers are zero-initialized by default. - # So we don't actually need to do anything to set up the regions - # and pages. - - # Allocate the superblock header. 
- superblock = Base.unsafe_convert(Ptr{ScatterAllocSuperblock}, align_upward(start_ptr)) - unsafe_store!( - superblock, - ScatterAllocSuperblock(region_count, pages_per_region, page_size, C_NULL)) - - superblock -end - -# Takes a zero-filled region of memory and turns it into an arena -# managed by the GC, prefixed with an arena record. -function make_gc_arena!(::Type{ScatterAllocArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{ScatterAllocArena} where T - superblock_ptr = align_upward(start_ptr + sizeof(ScatterAllocArena)) - superblock = make_gc_superblock!(superblock_ptr, Csize_t(start_ptr) + size - Csize_t(superblock_ptr)) - arena = Base.unsafe_convert(Ptr{ScatterAllocArena}, start_ptr) - unsafe_store!( - arena, - ScatterAllocArena(superblock)) - - arena -end - # Tells if a GC heap contains a particular pointer. function contains(heap::GCHeapDescription, pointer::Ptr{T}) where T for region in heap.regions @@ -1476,33 +922,6 @@ function iterate_allocated(fun::Function, arena::Ptr{FreeListArena}) iterate_allocation_records(fun, allocation_list_head) end -# Composes a set that contains all data addresses of chunks that -# are on the shelves. -function chunks_on_shelves(arena::Ptr{BodegaArena}) - arena_data = unsafe_load(arena) - chunks_on_shelves = Set{Ptr{UInt8}}() - for i in 1:arena_data.shelf_count - shelf = unsafe_load(arena_data.shelves, i) - for j in shelf.chunk_finger:(shelf.capacity - 1) - push!(chunks_on_shelves, unsafe_load(shelf.chunks, j)) - end - end - return chunks_on_shelves -end - -# Iterates through all active allocation records in a GC arena. -function iterate_allocated(fun::Function, arena::Ptr{BodegaArena}) - shelf_chunks = chunks_on_shelves(arena) - - # Now iterate through the allocation list, ignoring records that have - # been placed on the shelves. - iterate_allocated(get_free_list(arena)) do record - if !(data_pointer(record) in shelf_chunks) - fun(record) - end - end -end - # Iterates through all free allocation records in a GC arena. 
function iterate_free(fun::Function, arena::Ptr{FreeListArena}) free_list_head = unsafe_load(arena).free_list_head @@ -1544,22 +963,6 @@ function gc_free_garbage(arena::Ptr{FreeListArena}, live_blocks::Set{Ptr{FreeLis end end -# Frees all dead blocks in an arena. -function gc_free_garbage(arena::Ptr{BodegaArena}, live_blocks::Set{Ptr{FreeListRecord}}) - # Mark chunks on shelves as live. - all_live_blocks = Set{Ptr{FreeListRecord}}(live_blocks) - shelf_chunks = chunks_on_shelves(arena) - for chunk_ptr in shelf_chunks - push!(all_live_blocks, record_pointer(chunk_ptr)) - end - - # Free garbage in the free list sub-arena. - gc_free_garbage(get_free_list(arena), all_live_blocks) - - # Mark the arena as ready for restocking. - unsafe_store!(@get_field_pointer(arena, :can_restock), true) -end - # Compact a GC arena's free list. This function will # 1. merge adjancent free blocks, and # 2. reorder free blocks to put small blocks at the front @@ -1609,31 +1012,6 @@ function gc_compact(arena::Ptr{FreeListArena})::Csize_t return sum(map(record -> unsafe_load(record).size, records)) end -# Compact a GC arena's free list. This function will -# 1. merge adjancent free blocks, and -# 2. reorder free blocks to put small blocks at the front -# of the free list, -# 3. tally the total number of free bytes and return that number. -function gc_compact(arena::Ptr{BodegaArena})::Csize_t - # Compact the free list. - tally = gc_compact(get_free_list(arena)) - - # Add the size of the chunks on shelves to the tally. - shelf_count = unsafe_load(@get_field_pointer(arena, :shelf_count)) - for i in 1:shelf_count - shelf_array = unsafe_load(@get_field_pointer(arena, :shelves)) - shelf_data = unsafe_load(shelf_array, i) - - finger = shelf_data.chunk_finger - if finger > shelf_data.capacity - finger = shelf_data.capacity - end - tally += shelf_data.chunk_size * (shelf_data.capacity - finger) - end - - tally -end - # Expands a GC arena by assigning it an additional heap region. 
function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion) extra_record = make_gc_block!(pointer(region), Csize_t(sizeof(region))) @@ -1644,11 +1022,6 @@ function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion) unsafe_store!(last_free_list_ptr, extra_record) end -# Expands a GC arena by assigning it an additional heap region. -function gc_expand(arena::Ptr{BodegaArena}, region::GCHeapRegion) - gc_expand(get_free_list(arena), region) -end - """A report of the GC's actions.""" mutable struct GCReport """The total wall-clock time of a kernel execution.""" From 2c058c74b3f850c047223816fc0a6a0d35b13e75 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Tue, 11 Jun 2019 17:50:30 +0200 Subject: [PATCH 138/146] Remove binary tree example --- examples/binary-tree.jl | 176 ---------------------------------------- 1 file changed, 176 deletions(-) delete mode 100644 examples/binary-tree.jl diff --git a/examples/binary-tree.jl b/examples/binary-tree.jl deleted file mode 100644 index 812af535..00000000 --- a/examples/binary-tree.jl +++ /dev/null @@ -1,176 +0,0 @@ -using CUDAdrv, CUDAnative -using Random, Test -import Base: haskey, insert! - -# This example defines a kernel that constructs a binary search -# tree for a set of numbers and then proceeds to test membership -# in that tree for a sequence of other numbers. -# -# The main point of this example is to demonstrate that even -# naive, pointer-chasing programs can be compiled to GPU kernels. 
- -const use_gc = false - -"""A binary search tree node.""" -abstract type BinarySearchTreeNode{T} end - -"""An internal node of a binary search tree.""" -mutable struct InternalNode{T} <: BinarySearchTreeNode{T} - value::T - left::BinarySearchTreeNode{T} - right::BinarySearchTreeNode{T} -end - -InternalNode{T}(value::T) where T = InternalNode{T}(value, LeafNode{T}(), LeafNode{T}()) - -"""A leaf node of a binary search tree.""" -mutable struct LeafNode{T} <: BinarySearchTreeNode{T} end - -"""A binary search tree data structure.""" -mutable struct BinarySearchTree{T} - root::BinarySearchTreeNode{T} -end - -"""Creates an empty binary search tree.""" -BinarySearchTree{T}() where T = BinarySearchTree{T}(LeafNode{T}()) - -"""Tells if a binary search tree contains a particular element.""" -function haskey(tree::BinarySearchTree{T}, value::T)::Bool where T - walk = tree.root - while isa(walk, InternalNode{T}) - if walk.value == value - return true - elseif walk.value > value - walk = walk.right - else - walk = walk.left - end - end - return false -end - -"""Inserts an element into a binary search tree.""" -function insert!(tree::BinarySearchTree{T}, value::T) where T - if !isa(tree.root, InternalNode{T}) - tree.root = InternalNode{T}(value) - return - end - - walk = tree.root::InternalNode{T} - while true - if walk.value == value - return - elseif walk.value > value - right = walk.right - if isa(right, InternalNode{T}) - walk = right - else - walk.right = InternalNode{T}(value) - return - end - else - left = walk.left - if isa(left, InternalNode{T}) - walk = left - else - walk.left = InternalNode{T}(value) - return - end - end - end -end - -""" -Creates a binary search tree that contains elements copied from a device array. 
-""" -function BinarySearchTree{T}(elements::CUDAnative.DevicePtr{T}, size::Integer) where T - tree = BinarySearchTree{T}() - for i in 1:size - insert!(tree, unsafe_load(elements, i)) - end - tree -end - -""" -Creates a binary search tree that contains elements copied from an array. -""" -function BinarySearchTree{T}(elements::Array{T}) where T - tree = BinarySearchTree{T}() - for i in 1:length(elements) - insert!(tree, elements[i]) - end - tree -end - -# Gets a sequence of Fibonacci numbers. -function fibonacci(::Type{T}, count::Integer)::Array{T} where T - if count == 0 - return [] - elseif count == 1 - return [one(T)] - end - - results = [one(T), one(T)] - for i in 1:(count - 2) - push!(results, results[length(results) - 1] + results[length(results)]) - end - return results -end - -const number_count = 200 -const thread_count = 64 -const tests_per_thread = 2000 - -# Define a kernel that copies values using a temporary buffer. -function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - tree = BinarySearchTree{Int64}(a, number_count) - - for j in 1:tests_per_thread - offset = (i - 1) * tests_per_thread - index = offset + j - unsafe_store!(b, haskey(tree, unsafe_load(b, index)), index) - end - - return -end - -ccall((:ha_init_bytes, "/media/jonathan/Quark/School/CUDAnative.jl/libhalloc"), Cvoid, (Csize_t,), Csize_t(256 * 1024 * 1024)) - -# Generate a sequence of 64-bit truncated Fibonacci numbers. -number_set = fibonacci(Int64, number_count) -# Randomize the sequence's order. -shuffle!(number_set) - -# Generate numbers for which we will test membership in the sequence. -test_sequence = Array(1:(thread_count * tests_per_thread)) - -# Allocate two arrays. 
-source_array = Mem.alloc(Int64, length(number_set)) -destination_array = Mem.alloc(Int64, length(test_sequence)) -source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) -destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) - -# Fill the source and destination arrays. -Mem.upload!(source_array, number_set) -Mem.upload!(destination_array, test_sequence) - -if use_gc - # Run the kernel. - @cuda gc=true malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) - - # Run it again. - Mem.upload!(destination_array, test_sequence) - stats = @cuda gc=true malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) -else - # Run the kernel. - @cuda malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) - - # Run it again and time it this time. - Mem.upload!(destination_array, test_sequence) - stats = CUDAdrv.@elapsed @cuda malloc="_Z8hamallocm" threads=thread_count kernel(source_pointer, destination_pointer) -end -println(stats) - -@test Mem.download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) From 2f4f77333f3c885a7fd940feb0ba071d7bb9713b Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Wed, 12 Jun 2019 13:45:23 +0200 Subject: [PATCH 139/146] Update GC benchmark runner --- gc-benchmarks/array-features.jl | 2 +- gc-benchmarks/run-all.jl | 46 +++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl index c27a876a..045d52bc 100644 --- a/gc-benchmarks/array-features.jl +++ b/gc-benchmarks/array-features.jl @@ -91,7 +91,7 @@ end function kernel(destination) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - for j in 1:3 + for j in 1:2 unsafe_store!(destination, manipulate_array(), i) end return diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index eb17da9b..43fbac42 100644 --- 
a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -24,7 +24,7 @@ gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] # Also write them to a CSV for further analysis. open("strategies.csv", "w") do file write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") - for key in sort([k for k in keys(results)]) + for key in sort(collect(keys(results))) runs = results[key] median_times = BenchmarkTools.median(runs) gc_time = median_times["gc"].time / 1e6 @@ -43,7 +43,7 @@ open("gc-heap-sizes.csv", "w") do file write(file, "benchmark,$(join(gc_tags, ',')),$(join(ratio_tags, ','))\n") all_times = [[] for t in gc_tags] all_normalized_times = [[] for t in gc_tags] - for key in sort([k for k in keys(results)]) + for key in sort(collect(keys(results))) runs = results[key] median_times = BenchmarkTools.median(runs) times = [median_times[t].time / 1e6 for t in gc_tags] @@ -58,3 +58,45 @@ open("gc-heap-sizes.csv", "w") do file end write(file, "mean,$(join(map(mean, all_times), ',')),$(join(map(mean, all_normalized_times), ','))\n") end + +open("gc-heap-sizes-summary.csv", "w") do file + write(file, "heap,mean-opt,mean-shared\n") + shared = Dict() + sizes = Dict() + for tag in gc_tags + shared[tag] = false + sizes[tag] = 60.0 + for part in split(tag, "-") + if endswith(part, "mb") + sizes[tag] = parse(Float64, part[1:end - 2]) + elseif part == "shared" + shared[tag] = true + end + end + end + + all_normalized_times = [[] for t in gc_tags] + for key in sort(collect(keys(results))) + runs = results[key] + median_times = BenchmarkTools.median(runs) + normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + for (l, val) in zip(all_normalized_times, normalized_times) + push!(l, val) + end + end + + unique_sizes = sort(unique(values(sizes))) + data = zeros(Float64, (2, length(unique_sizes))) + for (tag, vals) in zip(gc_tags, all_normalized_times) + if shared[tag] + shared_index = 2 + else + 
shared_index = 1 + end + size_index = indexin(sizes[tag], unique_sizes)[1] + data[shared_index, size_index] = mean(vals) + end + for i in 1:length(unique_sizes) + write(file, "$(unique_sizes[i]),$(data[1, i]),$(data[2, i])\n") + end +end From 350f0ed34527729bad5b467311c668f03559343e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 14 Jun 2019 11:55:16 +0200 Subject: [PATCH 140/146] Tweak benchmarks --- gc-benchmarks/array-expansion.jl | 2 +- gc-benchmarks/run-all.jl | 18 ++++++-------- gc-benchmarks/utils.jl | 41 +++++++++++++++++++++++--------- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/gc-benchmarks/array-expansion.jl b/gc-benchmarks/array-expansion.jl index 76abf14a..f7b43075 100644 --- a/gc-benchmarks/array-expansion.jl +++ b/gc-benchmarks/array-expansion.jl @@ -7,7 +7,7 @@ using CUDAdrv, CUDAnative const thread_count = 256 const array_length = 200 -const runs = 10 +const runs = 5 function iterative_sum(elements::Array{T})::T where T result = zero(T) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 43fbac42..60822434 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -4,7 +4,6 @@ include("utils.jl") include("array-expansion.jl") include("array-features.jl") -include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") include("bitvector.jl") @@ -26,11 +25,10 @@ open("strategies.csv", "w") do file write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") for key in sort(collect(keys(results))) runs = results[key] - median_times = BenchmarkTools.median(runs) - gc_time = median_times["gc"].time / 1e6 - gc_shared_time = median_times["gc-shared"].time / 1e6 - nogc_time = median_times["nogc"].time / 1e6 - bump_time = median_times["bump"].time / 1e6 + gc_time = runs["gc"] / 1e6 + gc_shared_time = runs["gc-shared"] / 1e6 + nogc_time = runs["nogc"] / 1e6 + bump_time = runs["bump"] / 1e6 gc_ratio = gc_time / nogc_time gc_shared_ratio = 
gc_shared_time / nogc_time bump_ratio = bump_time / nogc_time @@ -45,12 +43,11 @@ open("gc-heap-sizes.csv", "w") do file all_normalized_times = [[] for t in gc_tags] for key in sort(collect(keys(results))) runs = results[key] - median_times = BenchmarkTools.median(runs) - times = [median_times[t].time / 1e6 for t in gc_tags] + times = [runs[t] / 1e6 for t in gc_tags] for (l, val) in zip(all_times, times) push!(l, val) end - normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + normalized_times = [runs[t] / runs["gc"] for t in gc_tags] for (l, val) in zip(all_normalized_times, normalized_times) push!(l, val) end @@ -78,8 +75,7 @@ open("gc-heap-sizes-summary.csv", "w") do file all_normalized_times = [[] for t in gc_tags] for key in sort(collect(keys(results))) runs = results[key] - median_times = BenchmarkTools.median(runs) - normalized_times = [median_times[t].time / median_times["gc"].time for t in gc_tags] + normalized_times = [runs[t] / runs["gc"] for t in gc_tags] for (l, val) in zip(all_normalized_times, normalized_times) push!(l, val) end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 6ceca63d..4598c743 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,4 +1,4 @@ -import BenchmarkTools +import BenchmarkTools, JSON function get_gc_mode() try @@ -59,25 +59,26 @@ macro cuda_sync(args...) 
end) end -suite = BenchmarkTools.BenchmarkGroup() +suites = Dict() function register_cuda_benchmark(f, name, config) - suite[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 + suites[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 end const MiB = 1 << 20 benchmark_tags = [ "gc", "gc-shared", + "gc-45mb", "gc-shared-45mb", "gc-30mb", "gc-shared-30mb", "gc-15mb", "gc-shared-15mb", - "gc-7.5mb", "gc-shared-7.5mb", - "gc-3.75mb", "gc-shared-3.75mb", + "gc-10mb", "gc-shared-10mb", "nogc", "bump" ] macro cuda_benchmark(name, ex) esc(quote + local suite = BenchmarkTools.BenchmarkGroup() local function register_gc_shared(config, heap_size) register_cuda_benchmark($name, config) do global gc_mode = "gc" @@ -98,17 +99,17 @@ macro cuda_benchmark(name, ex) end end - suite[$name] = BenchmarkTools.BenchmarkGroup(benchmark_tags) + suites[$name] = BenchmarkTools.BenchmarkGroup(benchmark_tags) register_gc("gc", 60 * MiB) register_gc_shared("gc-shared", 60 * MiB) + register_gc("gc-45mb", 45 * MiB) + register_gc_shared("gc-shared-45mb", 45 * MiB) register_gc("gc-30mb", 30 * MiB) register_gc_shared("gc-shared-30mb", 30 * MiB) register_gc("gc-15mb", 15 * MiB) register_gc_shared("gc-shared-15mb", 15 * MiB) - register_gc("gc-7.5mb", div(15 * MiB, 2)) - register_gc_shared("gc-shared-7.5mb", div(15 * MiB, 2)) - register_gc("gc-3.75mb", div(15 * MiB, 4)) - register_gc_shared("gc-shared-3.75mb", div(15 * MiB, 4)) + register_gc("gc-10mb", 10 * MiB) + register_gc_shared("gc-shared-10mb", 10 * MiB) register_cuda_benchmark($name, "nogc") do global gc_mode = "nogc" $(ex) @@ -121,7 +122,25 @@ macro cuda_benchmark(name, ex) end function run_benchmarks() - BenchmarkTools.run(suite) + cache_dir = mkpath("gc-benchmarks/results-cache") + results = Dict() + for (name, group) in pairs(suites) + cache_path = 
"$cache_dir/$(replace(name, " " => "-")).json" + if isfile(cache_path) + group_results = open(cache_path, "r") do file + JSON.parse(file) + end + else + runs = BenchmarkTools.run(group) + median_times = BenchmarkTools.median(runs) + group_results = Dict(k => r.time for (k, r) in pairs(median_times)) + open(cache_path, "w") do file + JSON.print(file, group_results) + end + end + results[name] = group_results + end + return results end module CUDArandom From 93a2f57a93c49f8a75d8ae073437fa6236ba7217 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 14 Jun 2019 16:56:03 +0200 Subject: [PATCH 141/146] Add a mean to 'strategies.csv' too --- strategies.csv | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 strategies.csv diff --git a/strategies.csv b/strategies.csv new file mode 100644 index 00000000..baa41e09 --- /dev/null +++ b/strategies.csv @@ -0,0 +1,13 @@ +benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio +array expansion,517.36013,145.260881,395.999929,11.459003,1.0,0.28077324203548504,0.7654241330888795,0.02214898739877771 +array features,236.641882,134.679289,330.545038,18.5354245,1.0,0.5691270195357896,1.3968154546708682,0.07832689777205204 +arrays,5001.1374265,888.7860235,1356.95648,4.4352555,1.0,0.17771677674572697,0.2713295725107985,0.0008868493548084666 +binary tree,1993.06871,571.320489,950.1915835,33.504988,1.0,0.28665368440810046,0.4767480311805206,0.016810754105913386 +bitvector,3095.321124,690.0682245,3010.889644,25.915606,1.0,0.22293913841425392,0.9727228689309975,0.008372509656287282 +genetic algo,274.7332775,173.877077,936.568618,4.434724,1.0,0.6328941240108782,3.4090104647042625,0.01614192514410636 +linked list,3983.6005275,712.9961405,711.111524,4.4301,1.0,0.1789828411704366,0.17850974742346326,0.0011120843993813333 +matrix,52.3009975,152.4938,157.114092,36.3442955,1.0,2.9156958239658812,3.004036242329795,0.6949063543195328 +ssa 
opt,682.361637,238.392158,1425.6750165,4.306012,1.0,0.3493633655140551,2.0893246911827785,0.006310454407916839 +static arrays,454.761664,131.938243,180.566625,6.793038,1.0,0.29012613297148987,0.3970577102119144,0.014937578379517936 +stream queries,5138.209838,578.285759,4061.109831,4.434645,1.0,0.11254615463993824,0.7903744609583226,0.0008630719919617265 +mean,1948.1361103636364,401.64528040909096,1228.7934891818181,14.053917409090907,1.0,0.5469834821283668,1.2501230342902363,0.07825613335729599 From 27822446c87d4a0efa475393fd73a44c4a2a6f76 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 14 Jun 2019 17:07:07 +0200 Subject: [PATCH 142/146] Remove strategies.csv from root dir --- gc-benchmarks/run-all.jl | 14 +++++++++++++- strategies.csv | 13 ------------- 2 files changed, 13 insertions(+), 14 deletions(-) delete mode 100644 strategies.csv diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 60822434..185af2d7 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -23,6 +23,17 @@ gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] # Also write them to a CSV for further analysis. 
open("strategies.csv", "w") do file write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") + all_results = [] + function write_line(key, results) + if length(all_results) == 0 + all_results = [Float64[] for _ in results] + end + write(file, "$key,$(join(results, ','))\n") + for (l, val) in zip(all_results, results) + push!(l, val) + end + end + for key in sort(collect(keys(results))) runs = results[key] gc_time = runs["gc"] / 1e6 @@ -32,8 +43,9 @@ open("strategies.csv", "w") do file gc_ratio = gc_time / nogc_time gc_shared_ratio = gc_shared_time / nogc_time bump_ratio = bump_time / nogc_time - write(file, "$key,$nogc_time,$gc_time,$gc_shared_time,$bump_time,1,$gc_ratio,$gc_shared_ratio,$bump_ratio\n") + write_line(key, [nogc_time, gc_time, gc_shared_time, bump_time, 1.0, gc_ratio, gc_shared_ratio, bump_ratio]) end + write_line("mean", mean.(all_results)) end open("gc-heap-sizes.csv", "w") do file diff --git a/strategies.csv b/strategies.csv deleted file mode 100644 index baa41e09..00000000 --- a/strategies.csv +++ /dev/null @@ -1,13 +0,0 @@ -benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio -array expansion,517.36013,145.260881,395.999929,11.459003,1.0,0.28077324203548504,0.7654241330888795,0.02214898739877771 -array features,236.641882,134.679289,330.545038,18.5354245,1.0,0.5691270195357896,1.3968154546708682,0.07832689777205204 -arrays,5001.1374265,888.7860235,1356.95648,4.4352555,1.0,0.17771677674572697,0.2713295725107985,0.0008868493548084666 -binary tree,1993.06871,571.320489,950.1915835,33.504988,1.0,0.28665368440810046,0.4767480311805206,0.016810754105913386 -bitvector,3095.321124,690.0682245,3010.889644,25.915606,1.0,0.22293913841425392,0.9727228689309975,0.008372509656287282 -genetic algo,274.7332775,173.877077,936.568618,4.434724,1.0,0.6328941240108782,3.4090104647042625,0.01614192514410636 -linked 
list,3983.6005275,712.9961405,711.111524,4.4301,1.0,0.1789828411704366,0.17850974742346326,0.0011120843993813333 -matrix,52.3009975,152.4938,157.114092,36.3442955,1.0,2.9156958239658812,3.004036242329795,0.6949063543195328 -ssa opt,682.361637,238.392158,1425.6750165,4.306012,1.0,0.3493633655140551,2.0893246911827785,0.006310454407916839 -static arrays,454.761664,131.938243,180.566625,6.793038,1.0,0.29012613297148987,0.3970577102119144,0.014937578379517936 -stream queries,5138.209838,578.285759,4061.109831,4.434645,1.0,0.11254615463993824,0.7903744609583226,0.0008630719919617265 -mean,1948.1361103636364,401.64528040909096,1228.7934891818181,14.053917409090907,1.0,0.5469834821283668,1.2501230342902363,0.07825613335729599 From 7380683bcfefa8d73fac230b3df80f3381f18146 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 17 Jun 2019 10:14:15 +0200 Subject: [PATCH 143/146] Include array reduction benchmark in GC benchmark suite --- gc-benchmarks/run-all.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 185af2d7..6d4b3c4d 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -4,6 +4,7 @@ include("utils.jl") include("array-expansion.jl") include("array-features.jl") +include("array-reduction.jl") include("arrays.jl") include("binary-tree.jl") include("bitvector.jl") From c6390edda89caad1b774eaad4af3a7f3b00530a4 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sat, 22 Jun 2019 16:15:05 +0200 Subject: [PATCH 144/146] Insert a root buffer overflow check --- src/gc.jl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/gc.jl b/src/gc.jl index 1fce27c4..0564097b 100644 --- a/src/gc.jl +++ b/src/gc.jl @@ -264,11 +264,21 @@ Registers a GC frame with the garbage collector. 
@inline function push_gc_frame(gc_frame::GCFrame, size::UInt32) master_record = get_gc_master_record() + threadid = get_thread_id() + next_rootbuf_start = master_record.root_buffers + threadid * master_record.root_buffer_capacity * sizeof(Ptr{ObjectRef}) + new_rootbuf_finger = gc_frame + size * sizeof(ObjectRef) + + # Check that we have enough room to push the GC frame. + if new_rootbuf_finger >= next_rootbuf_start + @cuprintf("Root buffer overflow in thread %ld.\n", threadid) + return + end + # Update the root buffer tip. unsafe_store!( master_record.root_buffer_fingers, - gc_frame + size * sizeof(ObjectRef), - get_thread_id()) + new_rootbuf_finger, + threadid) return end From a91baefa9118ea07d22788037714b84904ed897d Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 5 Jul 2019 12:23:51 +0200 Subject: [PATCH 145/146] Update benchmarks with pinned memory bump allocator --- gc-benchmarks/run-all.jl | 6 ++++-- gc-benchmarks/utils.jl | 14 +++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl index 6d4b3c4d..359d80bc 100644 --- a/gc-benchmarks/run-all.jl +++ b/gc-benchmarks/run-all.jl @@ -23,7 +23,7 @@ gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] # Also write them to a CSV for further analysis. 
open("strategies.csv", "w") do file - write(file, "benchmark,nogc,gc,gc-shared,bump,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio\n") + write(file, "benchmark,nogc,gc,gc-shared,bump,bump-pinned,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio,bump-pinned-ratio\n") all_results = [] function write_line(key, results) if length(all_results) == 0 @@ -41,10 +41,12 @@ open("strategies.csv", "w") do file gc_shared_time = runs["gc-shared"] / 1e6 nogc_time = runs["nogc"] / 1e6 bump_time = runs["bump"] / 1e6 + bump_pinned_time = runs["bump-pinned"] / 1e6 gc_ratio = gc_time / nogc_time gc_shared_ratio = gc_shared_time / nogc_time bump_ratio = bump_time / nogc_time - write_line(key, [nogc_time, gc_time, gc_shared_time, bump_time, 1.0, gc_ratio, gc_shared_ratio, bump_ratio]) + bump_pinned_ratio = bump_pinned_time / nogc_time + write_line(key, [nogc_time, gc_time, gc_shared_time, bump_time, bump_pinned_time, 1.0, gc_ratio, gc_shared_ratio, bump_ratio, bump_pinned_ratio]) end write_line("mean", mean.(all_results)) end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 4598c743..89c30271 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -44,9 +44,13 @@ macro cuda_sync(args...) local mode = get_gc_mode() if mode == "gc" CUDAnative.@cuda gc=true gc_config=gc_config $(args...) 
- elseif mode == "bump" + elseif startswith(mode, "bump") local capacity = 60 * MiB - local buf = Mem.alloc(Mem.DeviceBuffer, capacity) + if mode == "bump" + local buf = Mem.alloc(Mem.DeviceBuffer, capacity) + else + local buf = Mem.alloc(Mem.HostBuffer, capacity) + end local start_address = pointer(buf) local function init(kernel) CUDAnative.Runtime.bump_alloc_init!(kernel, start_address, capacity) @@ -73,7 +77,7 @@ benchmark_tags = [ "gc-30mb", "gc-shared-30mb", "gc-15mb", "gc-shared-15mb", "gc-10mb", "gc-shared-10mb", - "nogc", "bump" + "nogc", "bump", "bump-pinned" ] macro cuda_benchmark(name, ex) @@ -118,6 +122,10 @@ macro cuda_benchmark(name, ex) global gc_mode = "bump" $(ex) end + register_cuda_benchmark($name, "bump-pinned") do + global gc_mode = "bump-pinned" + $(ex) + end end) end From 4b76aec1a559157e5d55a1838e61d66dfe8ebb8e Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Sat, 6 Jul 2019 19:30:42 +0200 Subject: [PATCH 146/146] Write breakdown-computing code --- gc-benchmarks/run-breakdown.jl | 108 +++++++++++++++++++++++++++++++++ gc-benchmarks/utils-common.jl | 66 ++++++++++++++++++++ gc-benchmarks/utils.jl | 71 +--------------------- 3 files changed, 176 insertions(+), 69 deletions(-) create mode 100644 gc-benchmarks/run-breakdown.jl create mode 100644 gc-benchmarks/utils-common.jl diff --git a/gc-benchmarks/run-breakdown.jl b/gc-benchmarks/run-breakdown.jl new file mode 100644 index 00000000..1d1bd5b9 --- /dev/null +++ b/gc-benchmarks/run-breakdown.jl @@ -0,0 +1,108 @@ +using CUDAdrv, CUDAnative, Test, Statistics, JSON + +include("utils-common.jl") + +const benchmarks = Dict() +global benchmark_results = Dict() +global current_benchmark = nothing + +macro cuda_sync(args...) 
+ esc(quote + local heap_size = 10 * MiB + local local_arena_initial_size = div(heap_size, 10) + local global_arena_initial_size = heap_size - 8 * local_arena_initial_size + local gc_config = GCConfiguration( + local_arena_count=8, + local_arena_initial_size=local_arena_initial_size, + global_arena_initial_size=global_arena_initial_size) + local result = CUDAnative.@cuda gc=true gc_config=gc_config $(args...) + push!(benchmark_results[current_benchmark], result) + end) +end + +macro cuda_benchmark(name, ex) + esc(quote + benchmarks[$name] = (() -> $(ex)) + end) +end + +include("array-expansion.jl") +include("array-features.jl") +include("array-reduction.jl") +include("arrays.jl") +include("binary-tree.jl") +include("bitvector.jl") +include("linked-list.jl") +include("matrix.jl") +include("ssa-opt.jl") +include("static-arrays.jl") +include("stream-queries.jl") +include("genetic-algorithm.jl") + +function run_benchmarks() + cache_dir = mkpath("gc-benchmarks/breakdown-cache") + global benchmark_results = Dict() + results = Dict() + for (k, v) in pairs(benchmarks) + println(k) + cache_path = "$cache_dir/$(replace(k, " " => "-")).json" + if isfile(cache_path) + results[k] = open(cache_path, "r") do file + JSON.parse(file) + end + else + # Perform a dry run to ensure that compilations are cached. + global current_benchmark = k + benchmark_results[k] = [] + v() + + # Run the benchmarks for real. + benchmark_results[k] = [] + v() + while sum(map(x -> x.elapsed_time, benchmark_results[k])) < 90 + v() + end + + results[k] = [ + Dict( + "elapsed-time" => r.elapsed_time, + "collection-count" => r.collection_count, + "collection-poll-time" => r.collection_poll_time, + "collection-time" => r.collection_time) + for (k, r) in pairs(benchmark_results[k])] + + open(cache_path, "w") do file + JSON.print(file, results[k]) + end + end + end + return results +end + +results = run_benchmarks() +# Write results to a CSV file for further analysis. 
+open("breakdown.csv", "w") do file + write(file, "benchmark,collection-poll-ratio,collection-ratio,other-ratio\n") + all_results = [] + function write_line(key, results) + if length(all_results) == 0 + all_results = [Float64[] for _ in results] + end + write(file, "$key,$(join(results, ','))\n") + for (l, val) in zip(all_results, results) + push!(l, val) + end + end + + for key in sort(collect(keys(results))) + runs = results[key] + total_time = mean(getindex.(runs, "elapsed-time")) + poll_time = mean(getindex.(runs, "collection-poll-time")) + collection_time = mean(getindex.(runs, "collection-time")) + poll_ratio = poll_time / total_time + collection_ratio = collection_time / total_time + other_ratio = 1.0 - poll_ratio - collection_ratio + write_line(key, [poll_time, collection_ratio, other_ratio]) + end + write_line("mean", mean.(all_results)) +end diff --git a/gc-benchmarks/utils-common.jl b/gc-benchmarks/utils-common.jl new file mode 100644 index 00000000..334ae3c3 --- /dev/null +++ b/gc-benchmarks/utils-common.jl @@ -0,0 +1,66 @@ +module CUDArandom + +# A linear congruential pseudo-random number generator. +mutable struct LinearCongruentialGenerator + modulus::Int + a::Int + c::Int + state::Int +end + +LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) + +# Requests a pseudo-random number. +function next(generator::LinearCongruentialGenerator)::Int + generator.state = (generator.a * generator.state + generator.c) % generator.modulus + generator.state +end + +# Requests a pseudo-random number that is at least as great as `lower` +# and less than `upper`. 
+function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int + lower + next(generator) % (upper - lower) +end + +end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +const MiB = 1 << 20 +const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 +const BENCHMARK_HEAP_SIZE = 64 * MiB + +function set_malloc_heap_size(size::Integer) + CUDAdrv.@apicall( + :cuCtxSetLimit, + (Cint, Csize_t), + CU_LIMIT_MALLOC_HEAP_SIZE, + Csize_t(size)) +end + +""" + @sync ex +Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly +synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As +such, this operation is preferred over implicit synchronization (e.g. when performing a +memory copy) for high-performance applications. +It is also useful for timing code that executes asynchronously. 
+""" +macro sync(ex) + # Copied from https://github.com/JuliaGPU/CuArrays.jl/blob/8e45a27f2b12796f47683340845f98f017865676/src/utils.jl#L68-L86 + quote + local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING) + local ret = $(esc(ex)) + CUDAdrv.record(e) + CUDAdrv.synchronize(e) + ret + end +end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl index 89c30271..4fe2b540 100644 --- a/gc-benchmarks/utils.jl +++ b/gc-benchmarks/utils.jl @@ -1,5 +1,7 @@ import BenchmarkTools, JSON +include("utils-common.jl") + function get_gc_mode() try return gc_mode @@ -8,37 +10,6 @@ function get_gc_mode() end end -const MiB = 1 << 20 -const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 -const BENCHMARK_HEAP_SIZE = 64 * MiB - -function set_malloc_heap_size(size::Integer) - CUDAdrv.@apicall( - :cuCtxSetLimit, - (Cint, Csize_t), - CU_LIMIT_MALLOC_HEAP_SIZE, - Csize_t(size)) -end - -""" - @sync ex -Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly -synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As -such, this operation is preferred over implicit synchronization (e.g. when performing a -memory copy) for high-performance applications. -It is also useful for timing code that executes asynchronously. -""" -macro sync(ex) - # Copied from https://github.com/JuliaGPU/CuArrays.jl/blob/8e45a27f2b12796f47683340845f98f017865676/src/utils.jl#L68-L86 - quote - local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING) - local ret = $(esc(ex)) - CUDAdrv.record(e) - CUDAdrv.synchronize(e) - ret - end -end - macro cuda_sync(args...) 
esc(quote local mode = get_gc_mode() @@ -69,8 +40,6 @@ function register_cuda_benchmark(f, name, config) suites[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 end -const MiB = 1 << 20 - benchmark_tags = [ "gc", "gc-shared", "gc-45mb", "gc-shared-45mb", @@ -150,39 +119,3 @@ function run_benchmarks() end return results end - -module CUDArandom - -# A linear congruential pseudo-random number generator. -mutable struct LinearCongruentialGenerator - modulus::Int - a::Int - c::Int - state::Int -end - -LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) - -# Requests a pseudo-random number. -function next(generator::LinearCongruentialGenerator)::Int - generator.state = (generator.a * generator.state + generator.c) % generator.modulus - generator.state -end - -# Requests a pseudo-random number that is at least as great as `lower` -# and less than `upper`. -function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int - lower + next(generator) % (upper - lower) -end - -end - -function upload!(destination, source) - Mem.copy!(destination, pointer(source), sizeof(source)) -end - -function download(::Type{T}, source, dims) where T - result = Array{T}(undef, dims) - Mem.copy!(pointer(result), source, sizeof(result)) - result -end