diff --git a/examples/gc.jl b/examples/gc.jl new file mode 100644 index 00000000..4c53ecaf --- /dev/null +++ b/examples/gc.jl @@ -0,0 +1,57 @@ +using CUDAdrv, CUDAnative +using Test + +mutable struct TempStruct + data::Float32 +end + +@noinline function escape(val) + Base.pointer_from_objref(val) +end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +# Define a kernel that copies values using a temporary struct. +function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + for j in 1:2 + # Allocate a mutable struct and make sure it ends up on the GC heap. + temp = TempStruct(unsafe_load(a, i)) + escape(temp) + + # Allocate a large garbage buffer to force collections. + gc_malloc(Csize_t(256 * 1024)) + + # Use the mutable struct. If its memory has been reclaimed (by accident) + # then we expect the test at the end of this file to fail. + unsafe_store!(b, temp.data, i) + end + + return +end + +thread_count = 256 + +# Allocate two arrays. +source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count) +destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Fill the source and destination arrays. +upload!(source_array, fill(42.f0, thread_count)) +upload!(destination_array, zeros(Float32, thread_count)) + +# Run the kernel. +@cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) + +@test download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) diff --git a/examples/interrupt-memory.jl b/examples/interrupt-memory.jl new file mode 100644 index 00000000..631bb6ce --- /dev/null +++ b/examples/interrupt-memory.jl @@ -0,0 +1,54 @@ +using CUDAdrv, CUDAnative +using Test + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +# Define a kernel that copies some data from one array to another. +# The host is invoked to populate the source array. +function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + interrupt_or_wait() + threadfence_system() + Base.unsafe_store!(b, Base.unsafe_load(a, i), i) + return +end + +thread_count = 64 + +# Allocate two arrays. +source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count) +destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + +# Zero-fill the source and destination arrays. +upload!(source_array, zeros(Float32, thread_count)) +upload!(destination_array, zeros(Float32, thread_count)) + +# Define one stream for kernel execution and another for +# data transfer. +data_stream = CuStream() +exec_stream = CuStream() + +# Define a magic value. +magic = 42.f0 + +# Configure the interrupt to fill the input array with the magic value. 
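# The handler below passes a stream and `async = true` through `upload!`,
# which the two-argument helper defined above does not forward. A minimal
# sketch of the assumed overload (the exact `Mem.copy!` keyword names depend
# on the CUDAdrv version in use):
#
#     function upload!(destination, source, stream; async::Bool = false)
#         Mem.copy!(destination, pointer(source), sizeof(source);
#                   stream = stream, async = async)
#     end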
+function handle_interrupt() + upload!(source_array, fill(magic, thread_count), data_stream; async = true) + synchronize(data_stream) +end + +# Run the kernel. +@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer) + +# Check that the destination buffer is as expected. +@test download(Float32, destination_array, thread_count) == fill(magic, thread_count) diff --git a/examples/interrupt.jl b/examples/interrupt.jl new file mode 100644 index 00000000..a1c8f81e --- /dev/null +++ b/examples/interrupt.jl @@ -0,0 +1,24 @@ +using CUDAdrv, CUDAnative +using Test + +# Define a kernel that makes the host count. +function kernel() + interrupt() + return +end + +thread_count = 64 + +# Configure the interrupt to increment a counter. +global counter = 0 +function handle_interrupt() + global counter + counter += 1 +end + +# Run the kernel. +@cuda_interruptible handle_interrupt threads=thread_count kernel() + +# Check that the counter's final value equals the number +# of threads. +@test counter == thread_count diff --git a/examples/linked-list.jl b/examples/linked-list.jl new file mode 100644 index 00000000..ecb802ac --- /dev/null +++ b/examples/linked-list.jl @@ -0,0 +1,88 @@ +using CUDAnative, CUDAdrv +using Test +import Base: foldl, reduce, sum + +# This test constructs a linked list in a GPU kernel. + +use_gc = true + +abstract type List{T} +end + +mutable struct Nil{T} <: List{T} +end + +mutable struct Cons{T} <: List{T} + value::T + next::List{T} +end + +Cons{T}(value::T) where T = Cons{T}(value, Nil{T}()) + +function List{T}(pointer, count::Integer) where T + result = Nil{T}() + for i in count:-1:1 + result = Cons{T}(unsafe_load(pointer, i), result) + end + result +end + +function foldl(op, list::List{T}; init) where T + node = list + accumulator = init + while isa(node, Cons{T}) + accumulator = op(accumulator, node.value) + node = node.next + end + accumulator +end + +function reduce(op, list::List{T}; init) where T + foldl(op, list; init=init) +end + +function sum(list::List{T}) where T + reduce(+, list; init=zero(T)) +end + +const element_count = 2000 +const thread_count = 32 + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + l = List{Int64}(elements, element_count) + unsafe_store!(results, sum(l), i) + return +end + +# Allocate two arrays. +source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * element_count) +destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * thread_count) +source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +# Fill the source and destination arrays. +upload!(source_array, Array(1:element_count)) +upload!(destination_array, zeros(Int64, thread_count)) + +# Run the kernel. 
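# The GC launch below uses a deliberately small heap configuration: the
# `GCConfiguration` sizes are in bytes, so the global arena starts at just
# 1 KiB with a 1 KiB starvation threshold, which should force the collector
# to run and to grow the heap while each thread builds its list. The second
# launch returns a statistics object, which is what gets printed; the
# non-GC branch instead prints the time measured by `CUDAdrv.@elapsed`.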
+if use_gc + @cuda gc=true threads=thread_count gc_config=GCConfiguration(; global_arena_initial_size=1024, global_arena_starvation_threshold=1024) kernel(source_pointer, destination_pointer) + stats = @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) +else + @cuda threads=thread_count kernel(source_pointer, destination_pointer) + stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer) +end +println(stats) + +@test download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count) diff --git a/examples/lock.jl b/examples/lock.jl new file mode 100644 index 00000000..8f59d100 --- /dev/null +++ b/examples/lock.jl @@ -0,0 +1,46 @@ +using CUDAdrv, CUDAnative +using Test + +const thread_count = Int32(128) +const total_count = Int32(1024) + +# Define a kernel that atomically increments a counter using a lock. +function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.MutexState}) + lock = Mutex(lock_state) + done = false + while !done && try_lock(lock) + new_count = unsafe_load(counter) + 1 + unsafe_store!(counter, new_count) + if new_count == total_count + done = true + end + CUDAnative.unlock(lock) + end + return +end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +# Allocate memory for the counter and the lock. +counter_buf = Mem.alloc(Mem.DeviceBuffer, sizeof(Int32)) +upload!(counter_buf, [Int32(0)]) +counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + +lock_buf = Mem.alloc(Mem.DeviceBuffer, sizeof(CUDAnative.MutexState)) +upload!(lock_buf, [CUDAnative.MutexState(0)]) +lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.MutexState}, lock_buf) + +# Run the kernel. +@cuda threads=thread_count increment_counter(counter_pointer, lock_pointer) + +# Check that the counter's final value equals the number +# of threads. +@test download(Int32, counter_buf) == [Int32(total_count)] diff --git a/examples/matrix.jl b/examples/matrix.jl new file mode 100644 index 00000000..8a607103 --- /dev/null +++ b/examples/matrix.jl @@ -0,0 +1,133 @@ +# This example has kernels allocate dense symmetric matrices, fill them with Fibonacci numbers +# and compute their squares. The example is designed to stress the garbage allocator, specifically +# testing its ability to deal with many large objects. Furthermore, the example requires multiple +# collections to run to completion, so it also tests the performance of those collections. + +using StaticArrays, CUDAnative, CUDAdrv +import Base: getindex, setindex!, pointer, unsafe_convert, zeros + +const use_gc = true + +"""A fixed-size, heap-allocated array type for CUDAnative kernels.""" +struct FixedArray{T} + # The number of elements in the array. + size::Int + + # A pointer to the first element in the array. + # + # TODO: maybe protect this pointer from the GC somehow? + # At the moment, this pointer is protected automatically + # because the GC is conservative rather than precise. + ptr::Ptr{T} +end + +"""Allocates a heap-allocated array type and fills it with zeros.""" +function zeros(::Type{FixedArray{T}}, size::Int) where T + # Note: GC memory is always zero-initialized, so we don't + # actually have to fill the array with zeros. + bytesize = Csize_t(sizeof(T) * size) + buf = use_gc ? 
gc_malloc(bytesize) : CUDAnative.malloc(bytesize) + FixedArray{T}(size, unsafe_convert(Ptr{T}, buf)) +end + +"""Gets a pointer to the first element of a fixed-size array.""" +function pointer(array::FixedArray{T})::Ptr{T} where T + array.ptr +end + +function getindex(array::FixedArray{T}, i::Integer)::T where T + # TODO: bounds checking. + unsafe_load(pointer(array), i) +end + +function setindex!(array::FixedArray{T}, value::T, i::Integer) where T + # TODO: bounds checking. + unsafe_store!(pointer(array), value, i) +end + +"""A heap-allocated matrix type, suitable for CUDAnative kernels.""" +struct Matrix{Width, Height, T} + data::FixedArray{T} +end + +Matrix{Width, Height, T}() where {Width, Height, T} = + Matrix{Width, Height, T}(zeros(FixedArray{T}, Width * Height)) + +function pointer(matrix::Matrix{Width, Height, T})::Ptr{T} where {Width, Height, T} + pointer(matrix.data) +end + +function getindex(matrix::Matrix{Width, Height, T}, row::Int, column::Int) where {Width, Height, T} + getindex(matrix.data, (row - 1) * Width + column) +end + +function setindex!(matrix::Matrix{Width, Height, T}, value::T, row::Int, column::Int) where {Width, Height, T} + setindex!(matrix.data, value, (row - 1) * Width + column) +end + +const matrix_dim = 50 +const iterations = 20 +const thread_count = 256 + +function kernel(result::CUDAnative.DevicePtr{Int64}) + thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + accumulator = 0 + + for _ in 1:iterations + # Allocate a matrix. + matrix = Matrix{matrix_dim, matrix_dim, Int64}() + + # Fill it with Fibonacci numbers. + penultimate = 0 + ultimate = 1 + for i in 1:matrix_dim + for j in 1:matrix_dim + matrix[i, j] = ultimate + tmp = ultimate + ultimate = ultimate + penultimate + penultimate = tmp + end + end + + # Create a new element that contains the square of + # every element in `matrix`. + square = Matrix{matrix_dim, matrix_dim, Int64}() + for i in 1:matrix_dim + for j in 1:matrix_dim + square[i, j] = matrix[i, j] ^ 2 + end + end + + # Compute the sum of the squares. + square_sum = 0 + for i in 1:matrix_dim + for j in 1:matrix_dim + square_sum += square[i, j] + end + end + + # Add that sum to an accumulator. + accumulator += square_sum + end + + # Write the accumulator to the result array. + unsafe_store!(result, accumulator, thread_id) + + return +end + +destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * thread_count) +destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + +if use_gc + time = @cuda gc=true threads=thread_count kernel(destination_pointer) + println(time) + time = @cuda gc=true threads=thread_count kernel(destination_pointer) + println(time) +else + time = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) + println(time) + time = CUDAdrv.@elapsed @cuda threads=thread_count kernel(destination_pointer) + println(time) +end diff --git a/examples/stdlib-array.jl b/examples/stdlib-array.jl new file mode 100644 index 00000000..157a468f --- /dev/null +++ b/examples/stdlib-array.jl @@ -0,0 +1,20 @@ +using CUDAdrv, CUDAnative, StaticArrays + +# This example allocates an array in a GPU kernel. 
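# The array literal and the comprehension in `kernel` lower to the standard
# Julia array intrinsics (such as `jl_alloc_array_1d`), which the GC-enabled
# runtime services on the device. The `escape` helper takes the object's
# address behind a `@noinline` barrier so that the allocations are not
# optimized away.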
+ +const thread_count = 64 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +function kernel() + array = [1, 2, 3, 4, 5, 6, 7] + escape(array) + comp = [i * i for i in array] + escape(comp) + return +end + +@cuda gc=true threads=thread_count kernel() diff --git a/gc-benchmarks/array-expansion.jl b/gc-benchmarks/array-expansion.jl new file mode 100644 index 00000000..f7b43075 --- /dev/null +++ b/gc-benchmarks/array-expansion.jl @@ -0,0 +1,46 @@ +module ArrayExpansion + +using CUDAdrv, CUDAnative + +# This benchmark has every thread create arrays and repeatedly +# append elements to those arrays. + +const thread_count = 256 +const array_length = 200 +const runs = 5 + +function iterative_sum(elements::Array{T})::T where T + result = zero(T) + for i in elements + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + result = 0 + for j in 1:runs + array = Int[] + for k in 1:array_length + push!(array, k) + end + result += iterative_sum(array) + end + unsafe_store!(destination, result, i) + return +end + +end + +function array_expansion_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int) * ArrayExpansion.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int}, destination_array) + + # Run the kernel. + @cuda_sync threads=ArrayExpansion.thread_count ArrayExpansion.kernel(destination_pointer) + + @test download(Int, destination_array, ArrayExpansion.thread_count) == fill(ArrayExpansion.runs * sum(1:ArrayExpansion.array_length), ArrayExpansion.thread_count) +end + +@cuda_benchmark "array expansion" array_expansion_benchmark() diff --git a/gc-benchmarks/array-features.jl b/gc-benchmarks/array-features.jl new file mode 100644 index 00000000..045d52bc --- /dev/null +++ b/gc-benchmarks/array-features.jl @@ -0,0 +1,112 @@ +module ArrayFeatures + +using CUDAdrv, CUDAnative + +# This benchmark has every thread exercise the core low-level +# array API. + +const thread_count = 256 + +# Creates an array of Fibonacci numbers. +function fib_array(count::Integer) + # Calls `jl_alloc_array_1d`. + result = [1, 1] + # Calls `jl_array_sizehint`. + sizehint!(result, count + 2) + for i in 1:count + # Calls `jl_array_grow_end`. + push!(result, result[i] + result[i + 1]) + end + return result +end + +function intersperse_with!(vec::Vector{T}, value::T) where T + for i in 1:length(vec) + # Calls `jl_array_grow_at`. + insert!(vec, i * 2, value) + end + return vec +end + +function iterative_sum(array) + result = 0 + for i in array + result += i + end + return result +end + +function manipulate_array() + # Initialize the array as a Fibonacci sequence. + arr = fib_array(20) + + # Intersperse the array with constants. + intersperse_with!(arr, 2) + + # Prepend a constant to the array (calls `jl_array_grow_beg`). + pushfirst!(arr, 2) + + # Intersperse again. + intersperse_with!(arr, 4) + + # Delete the first element (calls `jl_array_del_beg`). + popfirst!(arr) + + # Delete the last element (calls `jl_array_del_end`). + pop!(arr) + + # Delete some other element (calls `jl_array_del_at`). + deleteat!(arr, 8) + + # Create a two-dimensional array (calls `jl_alloc_array_2d`). + arr_2d = fill(2, (2, 2)) + + # Create a three-dimensional array (calls `jl_alloc_array_3d`). + arr_3d = fill(2, (2, 2, 2)) + + # Create a four-dimensional array (calls `jl_new_array`). 
+ arr_4d = fill(2, (2, 2, 2, 2)) + + # Create an alias for the Fibonacci array (this is dangerous, but we + # know what we're doing here; calls `jl_ptr_to_array_1d`). + alias = unsafe_wrap(Array, pointer(arr), length(arr)) + + # Create an alias for `arr_2d` (calls `jl_ptr_to_array`). + alias_2d = unsafe_wrap(Array, pointer(arr_2d), size(arr_2d)) + + # Create an array that is similar to `arr_3d` and fill it with constants. + # This does not call any new low-level functions, but it does illustrate + # that high-level functions such as `similar` and `fill!` fully functional. + arr_3d_sim = similar(arr_3d) + fill!(arr_3d_sim, 10) + + return iterative_sum(arr) + + iterative_sum(arr_2d) + + iterative_sum(arr_3d) + + iterative_sum(arr_4d) + + iterative_sum(alias) + + iterative_sum(alias_2d) + + iterative_sum(arr_3d_sim) +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + for j in 1:2 + unsafe_store!(destination, manipulate_array(), i) + end + return +end + +end + +function array_features_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int) * ArrayFeatures.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int}, destination_array) + + # Run the kernel. + @cuda_sync threads=ArrayFeatures.thread_count ArrayFeatures.kernel(destination_pointer) + + @test download(Int, destination_array, ArrayFeatures.thread_count) == fill(ArrayFeatures.manipulate_array(), ArrayFeatures.thread_count) +end + +@cuda_benchmark "array features" array_features_benchmark() diff --git a/gc-benchmarks/array-reduction.jl b/gc-benchmarks/array-reduction.jl new file mode 100644 index 00000000..b4747de3 --- /dev/null +++ b/gc-benchmarks/array-reduction.jl @@ -0,0 +1,43 @@ +module ArrayReduction + +using CUDAdrv, CUDAnative + +# This benchmark approximates pi by naively constructing an array comprehension +# for the Madhava–Leibniz series and computing its sum. It does this a few times +# to achieve a respectable run time. + +const thread_count = 256 +const series_length = 200 +const runs = 20 + +function iterative_sum(elements::Array{T})::T where T + result = zero(T) + for i in elements + result += i + end + return result +end + +function kernel(destination) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + unsafe_store!(destination, 0.0, i) + for _ in 1:runs + series = [CUDAnative.pow(-1 / 3.0, Float64(k)) / (2.0 * k + 1.0) for k in 0:series_length] + unsafe_store!(destination, unsafe_load(destination, i) + CUDAnative.sqrt(12.0) * iterative_sum(series), i) + end + return +end + +end + +function array_reduction_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float64) * ArrayReduction.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + + # Run the kernel. + @cuda_sync threads=ArrayReduction.thread_count ArrayReduction.kernel(destination_pointer) + + @test download(Float64, destination_array, ArrayReduction.thread_count) ≈ ArrayReduction.runs .* fill(Float64(pi), ArrayReduction.thread_count) +end + +@cuda_benchmark "array reduction" array_reduction_benchmark() diff --git a/gc-benchmarks/arrays.jl b/gc-benchmarks/arrays.jl new file mode 100644 index 00000000..1f247f6c --- /dev/null +++ b/gc-benchmarks/arrays.jl @@ -0,0 +1,48 @@ +module Arrays + +using CUDAdrv, CUDAnative +import ..CUDArandom: LinearCongruentialGenerator, next + +# This benchmark allocates a hierarchy of fairly modest Julia arrays. +# Some arrays remain alive, others become unreachable. 
This benchmark +# seeks to ascertain the performance of the allocator and garbage collector. + +const thread_count = 64 +const insertion_count = 80 + +function insert(target::Array{Any, 1}, generator::LinearCongruentialGenerator) + while true + index = next(generator, 1, length(target)) + elem = target[index] + if isa(elem, Array{Any, 1}) + if length(elem) > 0 + if next(generator, 0, 2) == 0 + target = elem + continue + end + end + end + + target[index] = Any[Any[] for _ in 1:5] + return + end +end + +function kernel() + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + generator = LinearCongruentialGenerator(i) + toplevel = Any[Any[] for _ in 1:10] + for i in 1:insertion_count + insert(toplevel, generator) + end + return +end + +end + +function arrays_benchmark() + # Run the kernel. + @cuda_sync threads=Arrays.thread_count Arrays.kernel() +end + +@cuda_benchmark "arrays" arrays_benchmark() diff --git a/gc-benchmarks/binary-tree.jl b/gc-benchmarks/binary-tree.jl new file mode 100644 index 00000000..8341bb45 --- /dev/null +++ b/gc-benchmarks/binary-tree.jl @@ -0,0 +1,168 @@ +using Random, Test + +module BinaryTree + +using CUDAdrv, CUDAnative +import Base: haskey, insert! + +# This benchmark defines a kernel that constructs a binary search +# tree for a set of numbers and then proceeds to test membership +# in that tree for a sequence of other numbers. +# +# The benchmark is designed to stress the allocator's ability to +# allocate many small objects and garbage-collect the ones that +# become dead after a while. + +"""A binary search tree node.""" +abstract type BinarySearchTreeNode{T} end + +"""An internal node of a binary search tree.""" +mutable struct InternalNode{T} <: BinarySearchTreeNode{T} + value::T + left::BinarySearchTreeNode{T} + right::BinarySearchTreeNode{T} +end + +InternalNode{T}(value::T) where T = InternalNode{T}(value, LeafNode{T}(), LeafNode{T}()) + +"""A leaf node of a binary search tree.""" +mutable struct LeafNode{T} <: BinarySearchTreeNode{T} end + +"""A binary search tree data structure.""" +mutable struct BinarySearchTree{T} + root::BinarySearchTreeNode{T} +end + +"""Creates an empty binary search tree.""" +BinarySearchTree{T}() where T = BinarySearchTree{T}(LeafNode{T}()) + +"""Tells if a binary search tree contains a particular element.""" +function haskey(tree::BinarySearchTree{T}, value::T)::Bool where T + walk = tree.root + while isa(walk, InternalNode{T}) + if walk.value == value + return true + elseif walk.value > value + walk = walk.right + else + walk = walk.left + end + end + return false +end + +"""Inserts an element into a binary search tree.""" +function insert!(tree::BinarySearchTree{T}, value::T) where T + if !isa(tree.root, InternalNode{T}) + tree.root = InternalNode{T}(value) + return + end + + walk = tree.root::InternalNode{T} + while true + if walk.value == value + return + elseif walk.value > value + right = walk.right + if isa(right, InternalNode{T}) + walk = right + else + walk.right = InternalNode{T}(value) + return + end + else + left = walk.left + if isa(left, InternalNode{T}) + walk = left + else + walk.left = InternalNode{T}(value) + return + end + end + end +end + +""" +Creates a binary search tree that contains elements copied from a device array. +""" +function BinarySearchTree{T}(elements::CUDAnative.DevicePtr{T}, size::Integer) where T + tree = BinarySearchTree{T}() + for i in 1:size + insert!(tree, unsafe_load(elements, i)) + end + tree +end + +""" +Creates a binary search tree that contains elements copied from an array. 
+""" +function BinarySearchTree{T}(elements::Array{T}) where T + tree = BinarySearchTree{T}() + for i in 1:length(elements) + insert!(tree, elements[i]) + end + tree +end + +# Gets a sequence of Fibonacci numbers. +function fibonacci(::Type{T}, count::Integer)::Array{T} where T + if count == 0 + return [] + elseif count == 1 + return [one(T)] + end + + results = [one(T), one(T)] + for i in 1:(count - 2) + push!(results, results[length(results) - 1] + results[length(results)]) + end + return results +end + +const number_count = 200 +const thread_count = 64 +const tests_per_thread = 2000 + +# Define a kernel that copies values using a temporary buffer. +function kernel(a::CUDAnative.DevicePtr{Int64}, b::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + tree = BinarySearchTree{Int64}(a, number_count) + + for j in 1:tests_per_thread + offset = (i - 1) * tests_per_thread + index = offset + j + unsafe_store!(b, haskey(tree, unsafe_load(b, index)), index) + end + + return +end + +end + +function bintree_benchmark() + # Generate a sequence of 64-bit truncated Fibonacci numbers. + number_set = BinaryTree.fibonacci(Int64, BinaryTree.number_count) + # Randomize the sequence's order. + shuffle!(number_set) + + # Generate numbers for which we will test membership in the sequence. + test_sequence = Array(1:(BinaryTree.thread_count * BinaryTree.tests_per_thread)) + + # Allocate two arrays. + source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * length(number_set)) + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * length(test_sequence)) + source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + + # Fill the source and destination arrays. + upload!(source_array, number_set) + upload!(destination_array, test_sequence) + + # Run the kernel. + @cuda_sync threads=BinaryTree.thread_count BinaryTree.kernel(source_pointer, destination_pointer) + + @test download(Int64, destination_array, length(test_sequence)) == ([Int64(x in number_set) for x in test_sequence]) +end + +@cuda_benchmark "binary tree" bintree_benchmark() diff --git a/gc-benchmarks/bitvector.jl b/gc-benchmarks/bitvector.jl new file mode 100644 index 00000000..59892e92 --- /dev/null +++ b/gc-benchmarks/bitvector.jl @@ -0,0 +1,101 @@ +module Bitvector + +import Base: +, *, << +using CUDAnative + +# This benchmark performs naive arithmetic on bitvectors. +# The goal of the benchmark is to gauge how GPU-unaware +# standard library code that depends on arrays behaves when +# used in a GPU kernel. 
+ +const thread_count = 256 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +mutable struct BitInteger{N} + bits::BitVector +end + +function zero(::Type{BitInteger{N}})::BitInteger{N} where N + BitInteger{N}(falses(N)) +end + +function one(::Type{BitInteger{N}})::BitInteger{N} where N + result = falses(N) + result[1] = true + return BitInteger{N}(result) +end + +function +(a::BitInteger{N}, b::BitInteger{N})::BitInteger{N} where N + carry = false + c = falses(N) + for i in 1:N + s = Int(a.bits[i]) + Int(b.bits[i]) + Int(carry) + if s == 1 + carry = false + c[i] = true + elseif s == 2 + carry = true + elseif s == 3 + carry = true + c[i] = true + end + end + return BitInteger{N}(c) +end + +function <<(a::BitInteger{N}, amount::Integer)::BitInteger{N} where N + c = falses(N) + for i in 1:(N - amount) + c[i + amount] = a.bits[i] + end + return BitInteger{N}(c) +end + +function *(a::BitInteger{N}, b::BitInteger{N})::BitInteger{N} where N + c = zero(BitInteger{N}) + for i in 1:N + if a.bits[i] + c += (b << (i - 1)) + end + end + return c +end + +function factorial(::Type{BitInteger{N}}, value::Integer)::BitInteger{N} where N + accumulator = one(BitInteger{N}) + iv = one(BitInteger{N}) + for i in 1:value + accumulator *= iv + iv += one(BitInteger{N}) + end + return accumulator +end + +function to_int(value::BitInteger{N})::Int where N + result = 0 + for i in 1:N + if value.bits[i] + result += (1 << (i - 1)) + end + end + return result +end + +function kernel() + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + factorial(BitInteger{128}, 10) + return +end + +end + +function bitvector_benchmark() + # Run the kernel. + @cuda_sync threads=Bitvector.thread_count Bitvector.kernel() +end + +@cuda_benchmark "bitvector" bitvector_benchmark() diff --git a/gc-benchmarks/genetic-algorithm.jl b/gc-benchmarks/genetic-algorithm.jl new file mode 100644 index 00000000..06a83f74 --- /dev/null +++ b/gc-benchmarks/genetic-algorithm.jl @@ -0,0 +1,158 @@ +module GeneticAlgorithm + +# This benchmark runs a genetic algorithm on the GPU. +# The population is stored in linked lists and characters +# are stored in heap memory. + +using CUDAnative, CUDAdrv +import ..LinkedList: List, Nil, Cons, foldl, map, max +import ..CUDArandom: LinearCongruentialGenerator, next + +# A character in our genetic algorithm, based loosely on Fallout's SPECIAL system. +mutable struct Character + strength::Int + perception::Int + endurance::Int + charisma::Int + intelligence::Int + agility::Int + luck::Int +end + +# Computes the mean of two integers. 
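# Integer division truncates, so for the non-negative stat values used here
# the result is the floor of the true mean.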
+function mean(a::Int, b::Int)::Int + div(a + b, 2) +end + +function crossover(parent_one::Character, parent_two::Character)::Character + Character( + mean(parent_one.strength, parent_two.strength), + mean(parent_one.perception, parent_two.perception), + mean(parent_one.endurance, parent_two.endurance), + mean(parent_one.charisma, parent_two.charisma), + mean(parent_one.intelligence, parent_two.intelligence), + mean(parent_one.agility, parent_two.agility), + mean(parent_one.luck, parent_two.luck)) +end + +function mutate_stat(value::Int, generator::LinearCongruentialGenerator)::Int + new_stat = value + next(generator, -2, 3) + if new_stat > 10 + return 10 + elseif new_stat < 0 + return 0 + else + return new_stat + end +end + +function mutate(original::Character, generator::LinearCongruentialGenerator)::Character + Character( + mutate_stat(original.strength, generator), + mutate_stat(original.perception, generator), + mutate_stat(original.endurance, generator), + mutate_stat(original.charisma, generator), + mutate_stat(original.intelligence, generator), + mutate_stat(original.agility, generator), + mutate_stat(original.luck, generator)) +end + +function random_character(generator::LinearCongruentialGenerator)::Character + Character( + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11), + next(generator, 0, 11)) +end + +# Computes the fitness of a character. +function fitness(individual::Character)::Float64 + # Compute the character's cost, i.e., the sum of their stats. + cost = Float64(individual.strength + + individual.perception + + individual.endurance + + individual.charisma + + individual.intelligence + + individual.agility + + individual.luck) + + # Compute the character's true fitness, i.e., how well we expect + # the character to perform. + true_fitness = 0.0 + + function stat_fitness(stat::Int)::Float64 + if stat >= 5 + # Linear returns for stats greater than five. + return Float64(stat) + else + # Very low stats make for a poor character build. + return Float64(stat * stat) / 25.0 + end + end + + # Evaluate stats. + true_fitness += stat_fitness(individual.strength) + true_fitness += stat_fitness(individual.perception) + true_fitness += stat_fitness(individual.endurance) + true_fitness += stat_fitness(individual.charisma) + true_fitness += stat_fitness(individual.intelligence) + true_fitness += stat_fitness(individual.agility) + true_fitness += stat_fitness(individual.luck) + + # We like charisma, intelligence and luck. + true_fitness += Float64(individual.charisma) + true_fitness += Float64(individual.intelligence) + true_fitness += Float64(individual.luck) + + true_fitness - cost + 100.0 +end + +function fittest(population::List{Character})::Character + max(fitness, population, Character(0, 0, 0, 0, 0, 0, 0)) +end + +function step(population::List{Character}, generator::LinearCongruentialGenerator)::List{Character} + # Find the fittest individual in the population. + best = fittest(population) + # Do a bunch of crossovers and mutate the resulting population. + map(x -> mutate(crossover(best, x), generator), population) +end + +function genetic_algo(seed::Int)::Character + generator = LinearCongruentialGenerator(seed) + + # Generate some random characters. + individuals = Nil{Character}() + for j in 1:10 + individuals = Cons{Character}(random_character(generator), individuals) + end + + # Run the genetic algorithm for a few iterations. 
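# Each step allocates a fresh `Character` for every individual and a new
# list of `Cons` cells, so even two iterations produce a fair amount of
# garbage per thread.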
+ for j in 1:2 + individuals = step(individuals, generator) + end + + # Find the best individual in the population. + fittest(individuals) +end + +const thread_count = 256 + +function kernel(results::CUDAnative.DevicePtr{Float64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + fittest_individual = genetic_algo(i) + unsafe_store!(results, fitness(fittest_individual), i) +end + +end + +function genetic_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float64) * GeneticAlgorithm.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + @cuda_sync threads=GeneticAlgorithm.thread_count GeneticAlgorithm.kernel(destination_pointer) +end + +@cuda_benchmark "genetic algo" genetic_benchmark() diff --git a/gc-benchmarks/linked-list.jl b/gc-benchmarks/linked-list.jl new file mode 100644 index 00000000..5bc9b8ec --- /dev/null +++ b/gc-benchmarks/linked-list.jl @@ -0,0 +1,119 @@ +module LinkedList + +using CUDAnative, CUDAdrv +import Base: foldl, reduce, sum, max, map, reverse, filter + +# This benchmark constructs a linked list in a GPU kernel. +# In doing so, it stresses the allocator's ability to quickly +# allocate many small objects, as is common in idiomatic +# object-oriented programs. +# Thread divergence should be minimal in this benchmark. + +abstract type List{T} +end + +mutable struct Nil{T} <: List{T} +end + +mutable struct Cons{T} <: List{T} + value::T + next::List{T} +end + +Cons{T}(value::T) where T = Cons{T}(value, Nil{T}()) + +function List{T}(pointer, count::Integer) where T + result = Nil{T}() + for i in count:-1:1 + result = Cons{T}(unsafe_load(pointer, i), result) + end + result +end + +function foldl(op, list::List{T}; init) where T + node = list + accumulator = init + while isa(node, Cons{T}) + accumulator = op(accumulator, node.value) + node = node.next + end + accumulator +end + +function reduce(op, list::List{T}; init) where T + foldl(op, list; init=init) +end + +function sum(list::List{T}) where T + reduce(+, list; init=zero(T)) +end + +function map_reverse(f::Function, list::List{T})::List{T} where T + foldl(list; init=Nil{T}()) do accumulator, value + Cons{T}(f(value), accumulator) + end +end + +function reverse(list::List{T})::List{T} where T + map_reverse(x -> x, list) +end + +function map(f::Function, list::List{T})::List{T} where T + reverse(map_reverse(f, list)) +end + +function max(evaluate::Function, list::List{T}, default_value::T)::T where T + foldl(list; init=default_value) do max_elem, elem + if evaluate(max_elem) < evaluate(elem) + elem + else + max_elem + end + end +end + +function filter_reverse(f::Function, list::List{T})::List{T} where T + foldl(list; init=Nil{T}()) do accumulator, value + if f(value) + Cons{T}(value, accumulator) + else + accumulator + end + end +end + +function filter(f::Function, list::List{T})::List{T} where T + reverse(filter_reverse(f, list)) +end + +const element_count = 1000 +const thread_count = 32 + +function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + l = List{Int64}(elements, element_count) + unsafe_store!(results, sum(l), i) + return +end + +end + +function linkedlist_benchmark() + # Allocate two arrays. 
+ source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * LinkedList.element_count) + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * LinkedList.thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + + # Fill the source and destination arrays. + upload!(source_array, Array(1:LinkedList.element_count)) + upload!(destination_array, zeros(Int64, LinkedList.thread_count)) + + # Run the kernel. + @cuda_sync threads=LinkedList.thread_count LinkedList.kernel(source_pointer, destination_pointer) + + # Verify the kernel's output. + @test download(Int64, destination_array, LinkedList.thread_count) == repeat([sum(1:LinkedList.element_count)], LinkedList.thread_count) +end + +@cuda_benchmark "linked list" linkedlist_benchmark() diff --git a/gc-benchmarks/matrix.jl b/gc-benchmarks/matrix.jl new file mode 100644 index 00000000..5cb1cb57 --- /dev/null +++ b/gc-benchmarks/matrix.jl @@ -0,0 +1,45 @@ +module Matrix + +using StaticArrays, CUDAnative, CUDAdrv + +# This benchmark makes every thread allocate a large matrix. +# It stresses the allocator's ability to quickly allocate +# very large objects. + +const matrix_dim = 40 +const thread_count = 256 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +function fill() + m = zeros(MMatrix{matrix_dim, matrix_dim, Int64}) + + for i in 1:matrix_dim + for j in 1:matrix_dim + m[i, j] = i * j + end + end + + return escape(m) +end + +function kernel(result::CUDAnative.DevicePtr{Int64}) + thread_id = (blockIdx().x - 1) * blockDim().x + threadIdx().x + for i in 1:6 + unsafe_store!(result, fill()[20, 30], thread_id) + end + return +end + +end + +function matrix_benchmark() + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * Matrix.thread_count) + destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array) + @cuda_sync threads=Matrix.thread_count Matrix.kernel(destination_pointer) +end + +@cuda_benchmark "matrix" matrix_benchmark() diff --git a/gc-benchmarks/run-all.jl b/gc-benchmarks/run-all.jl new file mode 100644 index 00000000..359d80bc --- /dev/null +++ b/gc-benchmarks/run-all.jl @@ -0,0 +1,113 @@ +using CUDAdrv, CUDAnative, Test, Statistics + +include("utils.jl") + +include("array-expansion.jl") +include("array-features.jl") +include("array-reduction.jl") +include("arrays.jl") +include("binary-tree.jl") +include("bitvector.jl") +include("linked-list.jl") +include("matrix.jl") +include("ssa-opt.jl") +include("static-arrays.jl") +include("stream-queries.jl") +include("genetic-algorithm.jl") + +results = run_benchmarks() +# Print the results to the terminal. +println(results) + +gc_tags = [t for t in benchmark_tags if startswith(t, "gc")] + +# Also write them to a CSV for further analysis. 
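# Each row of `strategies.csv` holds the per-strategy times in milliseconds
# (BenchmarkTools reports nanoseconds, hence the division by 1e6) followed
# by the same times normalized to the `nogc` configuration; the final
# `mean` row averages every column across all benchmarks.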
+open("strategies.csv", "w") do file + write(file, "benchmark,nogc,gc,gc-shared,bump,bump-pinned,nogc-ratio,gc-ratio,gc-shared-ratio,bump-ratio,bump-pinned-ratio\n") + all_results = [] + function write_line(key, results) + if length(all_results) == 0 + all_results = [Float64[] for _ in results] + end + write(file, "$key,$(join(results, ','))\n") + for (l, val) in zip(all_results, results) + push!(l, val) + end + end + + for key in sort(collect(keys(results))) + runs = results[key] + gc_time = runs["gc"] / 1e6 + gc_shared_time = runs["gc-shared"] / 1e6 + nogc_time = runs["nogc"] / 1e6 + bump_time = runs["bump"] / 1e6 + bump_pinned_time = runs["bump-pinned"] / 1e6 + gc_ratio = gc_time / nogc_time + gc_shared_ratio = gc_shared_time / nogc_time + bump_ratio = bump_time / nogc_time + bump_pinned_ratio = bump_pinned_time / nogc_time + write_line(key, [nogc_time, gc_time, gc_shared_time, bump_time, bump_pinned_time, 1.0, gc_ratio, gc_shared_ratio, bump_ratio, bump_pinned_ratio]) + end + write_line("mean", mean.(all_results)) +end + +open("gc-heap-sizes.csv", "w") do file + ratio_tags = [t * "-ratio" for t in gc_tags] + write(file, "benchmark,$(join(gc_tags, ',')),$(join(ratio_tags, ','))\n") + all_times = [[] for t in gc_tags] + all_normalized_times = [[] for t in gc_tags] + for key in sort(collect(keys(results))) + runs = results[key] + times = [runs[t] / 1e6 for t in gc_tags] + for (l, val) in zip(all_times, times) + push!(l, val) + end + normalized_times = [runs[t] / runs["gc"] for t in gc_tags] + for (l, val) in zip(all_normalized_times, normalized_times) + push!(l, val) + end + write(file, "$key,$(join(times, ',')),$(join(normalized_times, ','))\n") + end + write(file, "mean,$(join(map(mean, all_times), ',')),$(join(map(mean, all_normalized_times), ','))\n") +end + +open("gc-heap-sizes-summary.csv", "w") do file + write(file, "heap,mean-opt,mean-shared\n") + shared = Dict() + sizes = Dict() + for tag in gc_tags + shared[tag] = false + sizes[tag] = 60.0 + for part in split(tag, "-") + if endswith(part, "mb") + sizes[tag] = parse(Float64, part[1:end - 2]) + elseif part == "shared" + shared[tag] = true + end + end + end + + all_normalized_times = [[] for t in gc_tags] + for key in sort(collect(keys(results))) + runs = results[key] + normalized_times = [runs[t] / runs["gc"] for t in gc_tags] + for (l, val) in zip(all_normalized_times, normalized_times) + push!(l, val) + end + end + + unique_sizes = sort(unique(values(sizes))) + data = zeros(Float64, (2, length(unique_sizes))) + for (tag, vals) in zip(gc_tags, all_normalized_times) + if shared[tag] + shared_index = 2 + else + shared_index = 1 + end + size_index = indexin(sizes[tag], unique_sizes)[1] + data[shared_index, size_index] = mean(vals) + end + for i in 1:length(unique_sizes) + write(file, "$(unique_sizes[i]),$(data[1, i]),$(data[2, i])\n") + end +end diff --git a/gc-benchmarks/run-breakdown.jl b/gc-benchmarks/run-breakdown.jl new file mode 100644 index 00000000..1d1bd5b9 --- /dev/null +++ b/gc-benchmarks/run-breakdown.jl @@ -0,0 +1,108 @@ +using CUDAdrv, CUDAnative, Test, Statistics, JSON + +include("utils-common.jl") + +const benchmarks = Dict() +global benchmark_results = Dict() +global current_benchmark = nothing + +macro cuda_sync(args...) 
+ esc(quote + local heap_size = 10 * MiB + local local_arena_initial_size = div(heap_size, 10) + local global_arena_initial_size = heap_size - 8 * local_arena_initial_size + local gc_config = GCConfiguration( + local_arena_count=8, + local_arena_initial_size=local_arena_initial_size, + global_arena_initial_size=global_arena_initial_size) + local result = CUDAnative.@cuda gc=true gc_config=gc_config $(args...) + push!(benchmark_results[current_benchmark], result) + end) +end + +macro cuda_benchmark(name, ex) + esc(quote + benchmarks[$name] = (() -> $(ex)) + end) +end + +include("array-expansion.jl") +include("array-features.jl") +include("array-reduction.jl") +include("arrays.jl") +include("binary-tree.jl") +include("bitvector.jl") +include("linked-list.jl") +include("matrix.jl") +include("ssa-opt.jl") +include("static-arrays.jl") +include("stream-queries.jl") +include("genetic-algorithm.jl") + +function run_benchmarks() + cache_dir = mkpath("gc-benchmarks/breakdown-cache") + global benchmark_results = Dict() + results = Dict() + for (k, v) in pairs(benchmarks) + println(k) + cache_path = "$cache_dir/$(replace(k, " " => "-")).json" + if isfile(cache_path) + results[k] = open(cache_path, "r") do file + JSON.parse(file) + end + else + # Perform a dry run to ensure that compilations are cached. + global current_benchmark = k + benchmark_results[k] = [] + v() + + # Run the benchmarks for real. + benchmark_results[k] = [] + v() + while sum(map(x -> x.elapsed_time, benchmark_results[k])) < 90 + v() + end + + results[k] = [ + Dict( + "elapsed-time" => r.elapsed_time, + "collection-count" => r.collection_count, + "collection-poll-time" => r.collection_poll_time, + "collection-time" => r.collection_time) + for (k, r) in pairs(benchmark_results[k])] + + open(cache_path, "w") do file + JSON.print(file, results[k]) + end + end + end + return results +end + +results = run_benchmarks() +# Write results to a CSV file for further analysis. +open("breakdown.csv", "w") do file + write(file, "benchmark,collection-poll-ratio,collection-ratio,other-ratio\n") + all_results = [] + function write_line(key, results) + if length(all_results) == 0 + all_results = [Float64[] for _ in results] + end + write(file, "$key,$(join(results, ','))\n") + for (l, val) in zip(all_results, results) + push!(l, val) + end + end + + for key in sort(collect(keys(results))) + runs = results[key] + total_time = mean(getindex.(runs, "elapsed-time")) + poll_time = mean(getindex.(runs, "collection-poll-time")) + collection_time = mean(getindex.(runs, "collection-time")) + poll_ratio = poll_time / total_time + collection_ratio = collection_time / total_time + other_ratio = 1.0 - poll_ratio - collection_ratio + write_line(key, [poll_time, collection_ratio, other_ratio]) + end + write_line("mean", mean.(all_results)) +end diff --git a/gc-benchmarks/ssa-opt.jl b/gc-benchmarks/ssa-opt.jl new file mode 100644 index 00000000..a9a83acd --- /dev/null +++ b/gc-benchmarks/ssa-opt.jl @@ -0,0 +1,100 @@ +# This benchmark defines a simple SSA IR, creates a basic +# block on the GPU and applies the constant folding optimization +# to it. + +module SSAOpt + +# A base type for SSA instructions. +abstract type Instruction end + +# A base type for values or flow in an SSA basic block. +abstract type ValueOrFlow end + +# A value in an SSA control-flow graph. +mutable struct Value <: ValueOrFlow + # The instruction that computes the value. + instruction::Instruction + + # The next value or control-flow instruction. 
+ next::ValueOrFlow +end + +# A base type for control-flow instructions in an SSA basic block. +abstract type Flow <: ValueOrFlow end + +# A control-flow instruction that returns a value. +mutable struct ReturnFlow <: Flow + # The value to return. + result::Value +end + +# A control-flow instruction that represents undefined control flow. +mutable struct UndefinedFlow <: Flow end + +# A basic block in an SSA control-flow graph. +mutable struct BasicBlock + # The first value or flow instruction in the basic block. + head::ValueOrFlow +end + +# An integer constant instruction. +mutable struct IConst <: Instruction + value::Int +end + +# An integer addition instruction. +mutable struct IAdd <: Instruction + # The left value. + left::Value + # The right value. + right::Value +end + +# Folds constants in a basic block. +function fold_constants(block::BasicBlock) + value = block.head + while isa(value, Value) + insn = value.instruction + if isa(insn, IAdd) + left = insn.left.instruction + right = insn.right.instruction + if isa(left, IConst) + if isa(right, IConst) + value.instruction = IConst(left.value + right.value) + end + end + end + value = value.next + end + block +end + +# Creates a block that naively computes `sum(1:range_max)`. +function create_range_sum_block(range_max) + head = accumulator = Value(IConst(0), UndefinedFlow()) + for i in 1:range_max + constant = Value(IConst(i), UndefinedFlow()) + accumulator.next = constant + accumulator = Value(IAdd(accumulator, constant), UndefinedFlow()) + constant.next = accumulator + end + ret_flow = ReturnFlow(accumulator) + accumulator.next = ret_flow + BasicBlock(head) +end + +const thread_count = 256 + +function kernel() + block = create_range_sum_block(25) + fold_constants(block) + return +end + +end + +function ssaopt_benchmark() + @cuda_sync threads=SSAOpt.thread_count SSAOpt.kernel() +end + +@cuda_benchmark "ssa opt" ssaopt_benchmark() diff --git a/gc-benchmarks/static-arrays.jl b/gc-benchmarks/static-arrays.jl new file mode 100644 index 00000000..88fcfa43 --- /dev/null +++ b/gc-benchmarks/static-arrays.jl @@ -0,0 +1,53 @@ +module StaticArraysBench + +using CUDAdrv, CUDAnative, StaticArrays + +# This benchmark allocates a variety of differently-sized static arrays. +# The point of this benchmark is to ascertain how well the GC handles +# many differently-sized objects. + +const thread_count = 64 + +@noinline function escape(value) + Base.pointer_from_objref(value) + value +end + +macro new_array(T, size) + quote + escape(zeros(MArray{Tuple{$size}, $T})) + end +end + +function kernel() + for i in 1:2 + for j in 1:2 + for k in 1:2 + for l in 1:2 + @new_array(Int64, 4) + @new_array(Int64, 8) + @new_array(Int64, 16) + end + @new_array(Int64, 32) + @new_array(Int64, 64) + @new_array(Int64, 128) + end + @new_array(Int64, 256) + @new_array(Int64, 512) + @new_array(Int64, 1024) + end + @new_array(Int64, 2048) + @new_array(Int64, 4096) + @new_array(Int64, 8192) + end + return +end + +end + +function static_arrays_benchmark() + # Run the kernel. 
+ @cuda_sync threads=StaticArraysBench.thread_count StaticArraysBench.kernel() +end + +@cuda_benchmark "static arrays" static_arrays_benchmark() diff --git a/gc-benchmarks/stream-queries.jl b/gc-benchmarks/stream-queries.jl new file mode 100644 index 00000000..e7d60953 --- /dev/null +++ b/gc-benchmarks/stream-queries.jl @@ -0,0 +1,31 @@ +module StreamQueries + +using CUDAnative, CUDAdrv +import ..LinkedList: List, Nil, Cons, foldl, map, max, filter + +# This benchmark applies stream operators (map, max,filter) to purely +# functional lists. + +const thread_count = 256 +const input_size = 100 + +function kernel(input::CUDAnative.DevicePtr{Float64}, output::CUDAnative.DevicePtr{Float64}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + values = List{Float64}(input, input_size) + values = map(x -> x * x, values) + values = filter(x -> x < 10.0 && x >= 0.0, values) + unsafe_store!(output, max(x -> x, values, 0.0), i) +end + +end + +function stream_benchmark() + source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float64) * StreamQueries.input_size) + upload!(source_array, rand(Float64, StreamQueries.input_size)) + destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float64) * StreamQueries.thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Float64}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Float64}, destination_array) + @cuda_sync threads=StreamQueries.thread_count StreamQueries.kernel(source_pointer, destination_pointer) +end + +@cuda_benchmark "stream queries" stream_benchmark() diff --git a/gc-benchmarks/utils-common.jl b/gc-benchmarks/utils-common.jl new file mode 100644 index 00000000..334ae3c3 --- /dev/null +++ b/gc-benchmarks/utils-common.jl @@ -0,0 +1,66 @@ +module CUDArandom + +# A linear congruential pseudo-random number generator. +mutable struct LinearCongruentialGenerator + modulus::Int + a::Int + c::Int + state::Int +end + +LinearCongruentialGenerator(seed::Int) = LinearCongruentialGenerator(1 << 32, 1664525, 1013904223, seed) + +# Requests a pseudo-random number. +function next(generator::LinearCongruentialGenerator)::Int + generator.state = (generator.a * generator.state + generator.c) % generator.modulus + generator.state +end + +# Requests a pseudo-random number that is at least as great as `lower` +# and less than `upper`. +function next(generator::LinearCongruentialGenerator, lower::Int, upper::Int)::Int + lower + next(generator) % (upper - lower) +end + +end + +function upload!(destination, source) + Mem.copy!(destination, pointer(source), sizeof(source)) +end + +function download(::Type{T}, source, dims) where T + result = Array{T}(undef, dims) + Mem.copy!(pointer(result), source, sizeof(result)) + result +end + +const MiB = 1 << 20 +const CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 +const BENCHMARK_HEAP_SIZE = 64 * MiB + +function set_malloc_heap_size(size::Integer) + CUDAdrv.@apicall( + :cuCtxSetLimit, + (Cint, Csize_t), + CU_LIMIT_MALLOC_HEAP_SIZE, + Csize_t(size)) +end + +""" + @sync ex +Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly +synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As +such, this operation is preferred over implicit synchronization (e.g. when performing a +memory copy) for high-performance applications. +It is also useful for timing code that executes asynchronously. 
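For example, `@sync @cuda threads=256 kernel(args...)` (a hypothetical launch)
returns only after the kernel has finished, so it can be wrapped in an
ordinary timing macro such as `Base.@elapsed`.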
+""" +macro sync(ex) + # Copied from https://github.com/JuliaGPU/CuArrays.jl/blob/8e45a27f2b12796f47683340845f98f017865676/src/utils.jl#L68-L86 + quote + local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING) + local ret = $(esc(ex)) + CUDAdrv.record(e) + CUDAdrv.synchronize(e) + ret + end +end diff --git a/gc-benchmarks/utils.jl b/gc-benchmarks/utils.jl new file mode 100644 index 00000000..4fe2b540 --- /dev/null +++ b/gc-benchmarks/utils.jl @@ -0,0 +1,121 @@ +import BenchmarkTools, JSON + +include("utils-common.jl") + +function get_gc_mode() + try + return gc_mode + catch ex + return "gc" + end +end + +macro cuda_sync(args...) + esc(quote + local mode = get_gc_mode() + if mode == "gc" + CUDAnative.@cuda gc=true gc_config=gc_config $(args...) + elseif startswith(mode, "bump") + local capacity = 60 * MiB + if mode == "bump" + local buf = Mem.alloc(Mem.DeviceBuffer, capacity) + else + local buf = Mem.alloc(Mem.HostBuffer, capacity) + end + local start_address = pointer(buf) + local function init(kernel) + CUDAnative.Runtime.bump_alloc_init!(kernel, start_address, capacity) + end + @sync CUDAnative.@cuda init=init malloc="ptx_bump_alloc" $(args...) + Mem.free(buf) + else + @sync CUDAnative.@cuda $(args...) + end + end) +end + +suites = Dict() + +function register_cuda_benchmark(f, name, config) + suites[name][config] = BenchmarkTools.@benchmarkable $f() setup=(set_malloc_heap_size(BENCHMARK_HEAP_SIZE); $f()) teardown=(device_reset!()) evals=1 seconds=90 +end + +benchmark_tags = [ + "gc", "gc-shared", + "gc-45mb", "gc-shared-45mb", + "gc-30mb", "gc-shared-30mb", + "gc-15mb", "gc-shared-15mb", + "gc-10mb", "gc-shared-10mb", + "nogc", "bump", "bump-pinned" +] + +macro cuda_benchmark(name, ex) + esc(quote + local suite = BenchmarkTools.BenchmarkGroup() + local function register_gc_shared(config, heap_size) + register_cuda_benchmark($name, config) do + global gc_mode = "gc" + global gc_config = GCConfiguration(local_arena_count=0, global_arena_initial_size=heap_size) + $(ex) + end + end + local function register_gc(config, heap_size) + register_cuda_benchmark($name, config) do + global gc_mode = "gc" + local local_arena_initial_size = div(heap_size, 10) + local global_arena_initial_size = heap_size - 8 * local_arena_initial_size + global gc_config = GCConfiguration( + local_arena_count=8, + local_arena_initial_size=local_arena_initial_size, + global_arena_initial_size=global_arena_initial_size) + $(ex) + end + end + + suites[$name] = BenchmarkTools.BenchmarkGroup(benchmark_tags) + register_gc("gc", 60 * MiB) + register_gc_shared("gc-shared", 60 * MiB) + register_gc("gc-45mb", 45 * MiB) + register_gc_shared("gc-shared-45mb", 45 * MiB) + register_gc("gc-30mb", 30 * MiB) + register_gc_shared("gc-shared-30mb", 30 * MiB) + register_gc("gc-15mb", 15 * MiB) + register_gc_shared("gc-shared-15mb", 15 * MiB) + register_gc("gc-10mb", 10 * MiB) + register_gc_shared("gc-shared-10mb", 10 * MiB) + register_cuda_benchmark($name, "nogc") do + global gc_mode = "nogc" + $(ex) + end + register_cuda_benchmark($name, "bump") do + global gc_mode = "bump" + $(ex) + end + register_cuda_benchmark($name, "bump-pinned") do + global gc_mode = "bump-pinned" + $(ex) + end + end) +end + +function run_benchmarks() + cache_dir = mkpath("gc-benchmarks/results-cache") + results = Dict() + for (name, group) in pairs(suites) + cache_path = "$cache_dir/$(replace(name, " " => "-")).json" + if isfile(cache_path) + group_results = open(cache_path, "r") do file + JSON.parse(file) + end + else + runs = 
BenchmarkTools.run(group) + median_times = BenchmarkTools.median(runs) + group_results = Dict(k => r.time for (k, r) in pairs(median_times)) + open(cache_path, "w") do file + JSON.print(file, group_results) + end + end + results[name] = group_results + end + return results +end diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 8f97957b..653a4c17 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -29,12 +29,18 @@ include(joinpath("device", "pointer.jl")) include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) include(joinpath("device", "llvm.jl")) +include(joinpath("device", "threading.jl")) + +# The interrupts and GC files need to be loaded _before_ the +# runtime intrinsics file, because some runtime intrinsics +# depend on the GC and the GC depends on interrupts. +include("interrupts.jl") +include("gc.jl") include(joinpath("device", "runtime.jl")) include("compiler.jl") include("execution.jl") include("reflection.jl") - include("deprecated.jl") include("init.jl") diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 04c9f0a5..5604d617 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -12,12 +12,23 @@ struct CompilerJob maxthreads::Union{Nothing,CuDim} blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} + # The name of the memory allocation function to use when allocating + # managed memory. A transform will rewrite all managed memory allocations + # to use this function instead. The 'malloc' signature must be + # 'void* malloc(size_t)' or compatible. + malloc::String + # Indicates whether the GPU GC or the "malloc never free" + # GC intrinsic lowering strategy is to be used. The former + # is used when this field is `true`; the latter when it is + # `false`. + gc::Bool name::Union{Nothing,String} CompilerJob(f, tt, cap, kernel; name=nothing, minthreads=nothing, maxthreads=nothing, - blocks_per_sm=nothing, maxregs=nothing) = - new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name) + blocks_per_sm=nothing, maxregs=nothing, + malloc="malloc",gc=false) = + new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, malloc, gc, name) end # global job reference diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index ce3d7382..035238af 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -51,7 +51,7 @@ end function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, dynamic_parallelism::Bool=true, optimize::Bool=true, - strip::Bool=false,strict::Bool=true) + strip::Bool=false, strict::Bool=true, internalize::Bool=true) ## Julia IR @timeit to[] "validation" check_method(job) @@ -91,12 +91,12 @@ function codegen(target::Symbol, job::CompilerJob; # always preload the runtime, and do so early; it cannot be part of any timing block # because it recurses into the compiler if libraries - runtime = load_runtime(job.cap) + runtime = load_runtime(job.cap, job.malloc) runtime_fns = LLVM.name.(defs(runtime)) end @timeit to[] "LLVM middle-end" begin - ir, kernel = @timeit to[] "IR generation" irgen(job, method_instance, world) + ir, kernel = @timeit to[] "IR generation" irgen(job, method_instance, world; internalize=internalize) if libraries undefined_fns = LLVM.name.(decls(ir)) @@ -154,7 +154,7 @@ function codegen(target::Symbol, job::CompilerJob; # cached compilation dyn_kernel_fn = get!(cache, dyn_job) do dyn_ir, dyn_kernel = codegen(:llvm, dyn_job; - optimize=optimize, strip=strip, + optimize=optimize, strip=strip, internalize=internalize, 
dynamic_parallelism=false, strict=false) dyn_kernel_fn = LLVM.name(dyn_kernel) link!(ir, dyn_ir) diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl index 2e3bd510..4c4699d3 100644 --- a/src/compiler/irgen.jl +++ b/src/compiler/irgen.jl @@ -137,7 +137,7 @@ function compile_method_instance(job::CompilerJob, method_instance::Core.MethodI return llvmf, dependencies end -function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world) +function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world; internalize::Bool=true) entry, dependencies = @timeit to[] "emission" compile_method_instance(job, method_instance, world) mod = LLVM.parent(entry) @@ -236,7 +236,26 @@ function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world) current_job = job linkage!(entry, LLVM.API.LLVMExternalLinkage) - internalize!(pm, [LLVM.name(entry)]) + if internalize + # We want to internalize functions so we can optimize + # them, but we don't really want to internalize globals + # because doing so may cause multiple copies of the same + # globals to appear after linking together modules. + # + # For example, the runtime library includes GC-related globals. + # It is imperative that these globals are shared by all modules, + # but if they are internalized before they are linked then + # they will actually not be internalized. + # + # Also, don't internalize the entry point, for obvious reasons. + non_internalizable_names = [LLVM.name(entry)] + for val in globals(mod) + if isa(val, LLVM.GlobalVariable) + push!(non_internalizable_names, LLVM.name(val)) + end + end + internalize!(pm, non_internalizable_names) + end add!(pm, ModulePass("LowerThrow", lower_throw!)) add!(pm, FunctionPass("HideUnreachable", hide_unreachable!)) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 1e76f146..976b700a 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -19,14 +19,14 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) # # NOTE: we need to use multiple distinct pass managers to force pass ordering; # intrinsics should never get lowered before Julia has optimized them. - if VERSION < v"1.2.0-DEV.375" + if VERSION < v"1.3.0-DEV.390" # with older versions of Julia, intrinsics are lowered unconditionally so we need to # replace them with GPU-compatible counterparts before anything else. 
that breaks # certain optimizations though: https://github.com/JuliaGPU/CUDAnative.jl/issues/340 ModulePassManager() do pm initialize!(pm) - add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) + add!(pm, FunctionPass("LowerGCFrame", eager_lower_gc_frame!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) run!(pm, mod) @@ -45,24 +45,27 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) ccall(:jl_add_optimization_passes, Cvoid, (LLVM.API.LLVMPassManagerRef, Cint, Cint), LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0) + ccall(:LLVMExtraAddLateLowerGCFramePass, Cvoid, (LLVM.API.LLVMPassManagerRef,), LLVM.ref(pm)) run!(pm, mod) end ModulePassManager() do pm initialize!(pm) + if job.gc + add!(pm, FunctionPass("InsertSafepointsGPUGC", fun -> insert_safepoints_gpugc!(fun, entry))) + add!(pm, ModulePass("FinalLowerGPUGC", lower_final_gc_intrinsics_gpugc!)) + add!(pm, FunctionPass("LowerArraysGPUGC", lower_array_calls_gc!)) + else + add!(pm, ModulePass("FinalLowerNoGC", lower_final_gc_intrinsics_nogc!)) + add!(pm, FunctionPass("LowerArraysNoGC", lower_array_calls_nogc!)) + end - # lower intrinsics - add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) - # the Julia GC lowering pass also has some clean-up that is required - if VERSION >= v"1.2.0-DEV.531" - late_lower_gc_frame!(pm) - end - run!(pm, mod) end + replace_malloc!(mod, job.malloc) end # PTX-specific optimizations @@ -296,6 +299,29 @@ function fixup_metadata!(f::LLVM.Function) end end +# Visits all calls to a particular intrinsic in a given LLVM module. +function visit_calls_to(visit_call::Function, name::AbstractString, mod::LLVM.Module) + if haskey(functions(mod), name) + func = functions(mod)[name] + + for use in uses(func) + call = user(use)::LLVM.CallInst + visit_call(call, func) + end + end +end + +# Deletes all calls to a particular intrinsic in a given LLVM module. +# Returns a Boolean that tells if any calls were actually deleted. +function delete_calls_to!(name::AbstractString, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(name, mod) do call, _ + unsafe_delete!(LLVM.parent(call), call) + changed = true + end + return changed +end + # lower object allocations to to PTX malloc # # this is a PoC implementation that is very simple: allocate, and never free. it also runs @@ -304,7 +330,7 @@ end # is currently very architecture/CPU specific: hard-coded pool sizes, TLS references, etc. # such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit # lower-level intrinsics which then can be lowered to architecture-specific code. -function lower_gc_frame!(fun::LLVM.Function) +function eager_lower_gc_frame!(fun::LLVM.Function) job = current_job::CompilerJob mod = LLVM.parent(fun) changed = false @@ -351,10 +377,729 @@ function lower_gc_frame!(fun::LLVM.Function) @compiler_assert isempty(uses(barrier)) job end +end + +# Visits all calls to a particular intrinsic in a given LLVM module +# and redirects those calls to a different function. +# Returns a Boolean that tells if any calls were actually redirected. 
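+#
+# For example, the GPU GC lowering further down in this file uses this helper to
+# forward the 'julia.new_gc_frame' intrinsic to the runtime's own implementation:
+#
+#     changed |= redirect_calls_to!("julia.new_gc_frame", Runtime.get(:new_gc_frame), mod)
+#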
+function redirect_calls_to!(from::AbstractString, to, mod::LLVM.Module)::Bool + changed = false + visit_calls_to(from, mod) do call, _ + args = collect(operands(call))[1:end - 1] + let builder = Builder(JuliaContext()) + position!(builder, call) + new_call = call!(builder, to, args) + replace_uses!(call, new_call) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed = true + end + return changed +end + +# Lowers the GC intrinsics produced by the LateLowerGCFrame pass to +# use the "malloc, never free" strategy. These intrinsics are the +# last point at which we can intervene in the pipeline before the +# passes that deal with them become CPU-specific. +function lower_final_gc_intrinsics_nogc!(mod::LLVM.Module) + changed = false + + # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates + # store for an object, including headroom, but does not set the object's + # tag. + visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes + gc_alloc_bytes_ft = eltype(llvmtype(gc_alloc_bytes))::LLVM.FunctionType + T_ret = return_type(gc_alloc_bytes_ft)::LLVM.PointerType + T_bitcast = LLVM.PointerType(T_ret, LLVM.addrspace(T_ret)) + + # Decode the call. + ops = collect(operands(call)) + size = ops[2] + + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + ptr = call!(builder, Runtime.get(:gc_pool_alloc), [total_size]) + cast_ptr = bitcast!(builder, ptr, T_bitcast) + bumped_ptr = gep!(builder, cast_ptr, [ConstantInt(Int32(1), JuliaContext())]) + result_ptr = bitcast!(builder, bumped_ptr, T_ret) + replace_uses!(call, result_ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # Next up: 'julia.new_gc_frame'. This intrinsic allocates a new GC frame. + # We'll lower it as an alloca and hope SSA construction and DCE passes + # get rid of the alloca. This is a reasonable thing to hope for because + # all intrinsics that may cause the GC frame to escape will be replaced by + # nops. + visit_calls_to("julia.new_gc_frame", mod) do call, new_gc_frame + new_gc_frame_ft = eltype(llvmtype(new_gc_frame))::LLVM.FunctionType + T_ret = return_type(new_gc_frame_ft)::LLVM.PointerType + T_alloca = eltype(T_ret) + + # Decode the call. + ops = collect(operands(call)) + size = ops[1] + + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = array_alloca!(builder, T_alloca, size) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.get_gc_frame_slot' is closely related to the previous + # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of + # a slot in the GC frame. We can simply turn this intrinsic into a GEP. + visit_calls_to("julia.get_gc_frame_slot", mod) do call, _ + # Decode the call. + ops = collect(operands(call)) + frame = ops[1] + offset = ops[2] + + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = gep!(builder, frame, [offset]) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.push_gc_frame' registers a GC frame with the GC. 
We + # don't have a GC, so we can just delete calls to this intrinsic! + changed |= delete_calls_to!("julia.push_gc_frame", mod) + + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC, so + # we can just delete calls to this intrinsic, too. + changed |= delete_calls_to!("julia.pop_gc_frame", mod) + + # Ditto for 'julia.queue_gc_root'. + changed |= delete_calls_to!("julia.queue_gc_root", mod) + + return changed +end + +# Emits instructions that allocate a particular number of bytes +# of GC-managed memory. No headroom is included. No tags are set. +function new_bytes!(builder::LLVM.Builder, malloc, size) + call!(builder, malloc, [size]) +end + +# Emits instructions that allocate bytes for an object, including +# headroom for the object's tag. Also fills in the object's tag if +# one is provided. +function new_object!(builder::LLVM.Builder, malloc, size, tag::Union{Type, Nothing} = nothing) + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + total_size = add!(builder, size, ConstantInt(Int32(headroom), JuliaContext())) + obj_ptr = new_bytes!(builder, malloc, total_size) + + jl_value_t = llvmtype(obj_ptr) + T_bitcast = LLVM.PointerType(jl_value_t, LLVM.addrspace(jl_value_t)) + + ptr = bitcast!(builder, obj_ptr, T_bitcast) + if tag != nothing + # Fill in the tag if we have one. + store!( + builder, + inttoptr!( + builder, + ConstantInt( + convert(LLVMType, Int64), + Int64(pointer_from_objref(tag))), + jl_value_t), + ptr) + end + bumped_ptr = gep!(builder, ptr, [ConstantInt(Int32(1), JuliaContext())]) + return bitcast!(builder, bumped_ptr, jl_value_t) +end + +""" +lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) + +An LLVM pass that lowers the GC intrinsics produced by the +LateLowerGCFrame pass to use the GPU GC. These intrinsics are the +last point at which we can intervene in the pipeline before the +passes that deal with them become CPU-specific. +""" +function lower_final_gc_intrinsics_gpugc!(mod::LLVM.Module) + changed = false + + # We'll start off with 'julia.gc_alloc_bytes'. This intrinsic allocates + # store for an object, including headroom, but does not set the object's + # tag. + visit_calls_to("julia.gc_alloc_bytes", mod) do call, gc_alloc_bytes + # Decode the call. + ops = collect(operands(call)) + size = ops[2] + + # We need to reserve a single pointer of headroom for the tag. + # (LateLowerGCFrame depends on us doing that.) + headroom = Runtime.tag_size + + # Call the allocation function and bump the resulting pointer + # so the headroom sits just in front of the returned pointer. + let builder = Builder(JuliaContext()) + position!(builder, call) + result_ptr = new_object!(builder, Runtime.get(:gc_malloc_object), size) + replace_uses!(call, result_ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # Next up: 'julia.new_gc_frame'. This intrinsic allocates a new GC frame. + # We actually have a call that implements this intrinsic. Let's use that. + changed |= redirect_calls_to!("julia.new_gc_frame", Runtime.get(:new_gc_frame), mod) + + # The 'julia.get_gc_frame_slot' is closely related to the previous + # intrinisc. Specifically, 'julia.get_gc_frame_slot' gets the address of + # a slot in the GC frame. We can simply turn this intrinsic into a GEP. 
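+    # Schematically (illustrative IR only; the exact pointer types are whatever
+    # the LateLowerGCFrame pass produced), a call such as
+    #
+    #     %slot = call %T** @julia.get_gc_frame_slot(%T** %frame, i32 %offset)
+    #
+    # becomes
+    #
+    #     %slot = getelementptr %T*, %T** %frame, i32 %offset
+    #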
+ visit_calls_to("julia.get_gc_frame_slot", mod) do call, _ + # Decode the call. + ops = collect(operands(call)) + frame = ops[1] + offset = ops[2] + + let builder = Builder(JuliaContext()) + position!(builder, call) + ptr = gep!(builder, frame, [offset]) + replace_uses!(call, ptr) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + + changed = true + end + + # The 'julia.push_gc_frame' registers a GC frame with the GC. We will + # call a function that does just this. + changed |= redirect_calls_to!("julia.push_gc_frame", Runtime.get(:push_gc_frame), mod) + + # The 'julia.pop_gc_frame' unregisters a GC frame with the GC. We again + # have a function in the runtime library. + changed |= redirect_calls_to!("julia.pop_gc_frame", Runtime.get(:pop_gc_frame), mod) + + # Delete calls to 'julia.queue_gc_root'. + changed |= delete_calls_to!("julia.queue_gc_root", mod) return changed end +# Tells if a function manages a GC frame. +function has_gc_frame(fun::LLVM.Function) + for insn in instructions(entry(fun)) + if isa(insn, LLVM.CallInst) + callee = called_value(insn) + if isa(callee, LLVM.Function) && LLVM.name(callee) == "julia.new_gc_frame" + return true + end + end + end + return false +end + +# Tells if an instruction is a call to a non-intrinsic callee. +function is_non_intrinsic_call(instruction::LLVM.Instruction) + if isa(instruction, LLVM.CallInst) + callee = called_value(instruction) + if isa(callee, LLVM.Function) + callee_name = LLVM.name(callee) + return !startswith(callee_name, "julia.") && !startswith(callee_name, "llvm.") + else + return true + end + else + return false + end +end + +""" + insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) + +An LLVM pass that inserts GC safepoints in such a way that threads +reach a safepoint after a reasonable amount of time. + +Moreover, this pass also inserts perma-safepoints after entry point returns. +Perma-safepoints inform the GC that it doesn't need to wait for a warp to +reach a safepoint; inserting them stops the GC from deadlocking. +""" +function insert_safepoints_gpugc!(fun::LLVM.Function, entry::LLVM.Function) + # Insert a safepoint before every function call, but only for + # functions that manage a GC frame. + # + # TODO: also insert safepoints on loop back-edges? This is what people + # usually do, but it requires nontrivial IR analyses that the LLVM C + # API doesn't expose. + + if has_gc_frame(fun) + safepoint_function = Runtime.get(:gc_safepoint) + let builder = Builder(JuliaContext()) + for block in blocks(fun) + for instruction in instructions(block) + if is_non_intrinsic_call(instruction) + if called_value(instruction) == safepoint_function + continue + end + + # Insert a safepoint just before the call. + position!(builder, instruction) + debuglocation!(builder, instruction) + call!(builder, safepoint_function, LLVM.Value[]) + end + end + end + dispose(builder) + end + end + + # Insert perma-safepoints if necessary. + if fun == entry + # Looks like we're going to have to insert perma-safepoints. + # We need to keep in mind that perma-safepoints are per-warp, + # so we absolutely cannot allow warps to be in a divergent + # state when a perma-safepoint is set---all bets are off if + # that happens anyway. + # + # To make sure that we don't end up in that situation, + # we will create a dedicated return block and replace all 'ret' + # instructions by jumps to that return block. + + # Create the dedicated return block. 
+ return_block = BasicBlock(fun, "kernel_exit") + let builder = Builder(JuliaContext()) + position!(builder, return_block) + call!(builder, Runtime.get(:gc_perma_safepoint), LLVM.Value[]) + ret!(builder) + dispose(builder) + end + + # Rewrite return instructions as branches to the return bloc. + for block in blocks(fun) + if block == return_block + # We need to be careful not to trick ourselves into + # turning the return block's 'ret' into an infinite loop. + continue + end + term = terminator(block) + if isa(term, LLVM.RetInst) + unsafe_delete!(block, term) + let builder = Builder(JuliaContext()) + position!(builder, block) + br!(builder, return_block) + dispose(builder) + end + end + end + end + return true +end + +# Tries to evaluate an LLVM IR constant as a literal pointer. +function to_literal_pointer(value)::Tuple{Bool, Ptr{Cvoid}} + if !isa(value, LLVM.ConstantExpr) + return (false, C_NULL) + end + + if !occursin("inttoptr", string(value)) + return (false, C_NULL) + end + + # Peel off addrspacecast and inttoptr. + ptr_arg = value + while occursin("addrspacecast", string(ptr_arg)) || occursin("inttoptr", string(ptr_arg)) + ptr_arg = first(operands(ptr_arg)) + end + ptr_val = convert(Int, ptr_arg) + (true, Ptr{Cvoid}(ptr_val)) +end + +# Visits all calls to literal pointers in a function. +function visit_literal_pointer_calls(visit_call::Function, fun::LLVM.Function) + for block in blocks(fun) + for call in instructions(block) + if !isa(call, LLVM.CallInst) + continue + end + + callee = called_value(call) + if !isa(callee, LLVM.ConstantExpr) + continue + end + + # detect calls to literal pointers + # FIXME: can we detect these properly? + # FIXME: jl_apply_generic and jl_invoke also have such arguments + is_ptr, ptr = to_literal_pointer(callee) + if is_ptr + # look it up in the Julia JIT cache + frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) + if length(frames) >= 1 + # @compiler_assert length(frames) == 1 job frames=frames + fn, file, line, linfo, fromC, inlined, ip = last(frames) + visit_call(call, fn) + end + end + end + end +end + +# Emits instructions that create a new array. The array's element type +# must be statically known. Its dimensions are represented as a tuple +# of LLVM IR values. A pointer to the new array is returned. +function new_array!(builder::LLVM.Builder, malloc, array_type::Type, dims::Tuple; data_ptr::Union{Nothing,LLVM.Value} = nothing) + # Since time immemorial, the structure of an array is (quoting from the + # Julia source code here): + # + # typedef struct { + # /* + # how - allocation style + # 0 = data is inlined, or a foreign pointer we don't manage + # 1 = julia-allocated buffer that needs to be marked + # 2 = malloc-allocated pointer this array object manages + # 3 = has a pointer to the object that owns the data + # */ + # uint16_t how:2; + # uint16_t ndims:10; + # uint16_t pooled:1; + # uint16_t ptrarray:1; // representation is pointer array + # uint16_t isshared:1; // data is shared by multiple Arrays + # uint16_t isaligned:1; // data allocated with memalign + # } jl_array_flags_t; + # + # JL_EXTENSION typedef struct { + # JL_DATA_TYPE + # void *data; + # #ifdef STORE_ARRAY_LEN + # size_t length; + # #endif + # jl_array_flags_t flags; + # uint16_t elsize; + # uint32_t offset; // for 1-d only. does not need to get big. 
+ # size_t nrows; + # union { + # // 1d + # size_t maxsize; + # // Nd + # size_t ncols; + # }; + # // other dim sizes go here for ndims > 2 + # + # // followed by alignment padding and inline data, or owner pointer + # } jl_array_t; + # + # where `STORE_ARRAY_LEN` is a preprocessor directive that is technically a + # "configuration option." AFAICT, `STORE_ARRAY_LEN` is just always defined in + # practice. + # + # The Julia compiler is more than happy to eagerly generate code that accesses + # fields of this data structure directly, so we can't invent our own array data + # structure. Consequently, we will emit code here that carefully constructs + # an instance of `jl_array_t`. + # + # To keep things tidy, we'll construct an array (ironic, I know) that contains the + # values we'll assign to each field of the array. After that, we will generate + # code that fills in every field in one fell swoop. + + fields = [] + + # Compute the size of the element type. + element_type = eltype(array_type) + llvm_element_type = convert(LLVMType, element_type, true) + mod = LLVM.parent(LLVM.parent(position(builder))) + layout = datalayout(mod) + element_size = Csize_t(sizeof(layout, llvm_element_type)) + + # Compute the number of elements in the array. + element_count = LLVM.ConstantInt(convert(LLVMType, Csize_t), 1) + for i in dims + element_count = mul!(builder, element_count, intcast!(builder, i, convert(LLVMType, Csize_t))) + end + + # Compute the size of the array's elements in bytes. + data_bytesize = mul!( + builder, + LLVM.ConstantInt(convert(LLVMType, Csize_t), element_size), + element_count) + + if element_size == Csize_t(1) && length(dims) == 1 + # If we're allocating an array of bytes, we will throw in an extra + # byte at the end for compatibility with Julia's ABI. + data_bytesize = add!(builder, data_bytesize, LLVM.ConstantInt(convert(LLVMType, Csize_t), 1)) + end + + # Actually allocate the array's contents. We will just always + # use a separate buffer. Inline data storage is wasteful and + # harder to implement. + if data_ptr == nothing + data_ptr = new_bytes!(builder, malloc, data_bytesize) + end + + # The pointer to the array's data is the first field of the struct. + push!(fields, data_ptr) + + # The array's length (i.e., the product of its dimensions) is the + # second field of the `jl_array_t` struct. + push!(fields, element_count) + + # Synthesize a constant that represents the array's flags. + flags = Int16(0) + # Set the 'how' field to one. + flags |= Int16(1) + # Set the 'nDims' field. + flags <<= 10 + flags |= Int16(length(dims)) + # Set the 'pooled' field to `false`. + flags <<= 1 + flags |= Int16(false) + # Set the 'ptrarray' field. + flags <<= 1 + flags |= Int16(isa(llvm_element_type, LLVM.PointerType)) + # Set the 'isshared' field to `false`. + flags <<= 1 + flags |= Int16(false) + # Set the 'isaligned' field to `true`. + flags <<= 1 + flags |= Int16(true) + # Add the flags to the `jl_array_t` struct. + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), flags)) + + # Set the 'elsize' field. + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(element_size))) + + # Set the 'offset' field to zero (the array is not a slice). + push!(fields, LLVM.ConstantInt(convert(LLVMType, Int16), Int16(0))) + + if length(dims) == 1 + # Set the 'nrows' field to the number of elements. + push!(fields, element_count) + # Ditto for the 'maxsize' field. 
+ push!(fields, element_count) + else + # If we're creating a multi-dimensional array, then the + # process is slightly different. + for i in dims + push!(fields, intcast!(builder, i, convert(LLVMType, Csize_t))) + end + end + + # Synthesize a struct type that neatly represents the data we want + # to store. + struct_type = LLVM.StructType([llvmtype(f) for f in fields]) + + # We now know exactly what data we want to store in each field of the + # array's control structure. + # All that's left is to actually allocate the array and write that data + # to the control structure. + obj_ptr = new_object!( + builder, + malloc, + ConstantInt(convert(LLVMType, Csize_t), sizeof(layout, struct_type)), + array_type) + struct_ptr = bitcast!( + builder, + addrspacecast!( + builder, + obj_ptr, + LLVM.PointerType(eltype(llvmtype(obj_ptr)))), + LLVM.PointerType(struct_type)) + + for i in 1:length(fields) + val = fields[i] + gep = struct_gep!(builder, struct_ptr, i - 1) + store!(builder, val, gep) + end + + return obj_ptr +end + +# Generates code that extracts array dimensions from a tuple argument. +function extract_array_dims!(builder, ::Type{Array{T, N}}, dims_tuple) where {T, N} + # First cast the tuple value to a size_t pointer in address space zero. + tuple_as_size_t = bitcast!( + builder, + addrspacecast!( + builder, + dims_tuple, + LLVM.PointerType(eltype(llvmtype(dims_tuple)))), + LLVM.PointerType(convert(LLVMType, Csize_t))) + + is_literal, ptr = to_literal_pointer(tuple_as_size_t) + + results = [] + if is_literal + # If the tuple is implemented as a literal pointer, then we want to load its elements + # ahead of time; the device won't be able to access host-allocated constants. + for i in 1:N + value = Base.unsafe_load(Base.unsafe_convert(Ptr{Csize_t}, ptr), i) + push!(results, LLVM.ConstantInt(convert(LLVMType, Csize_t), value)) + end + else + # Otherwise, generate code that loads fields from the tuple. + for i in 1:N + address = gep!( + builder, + tuple_as_size_t, + [LLVM.ConstantInt(convert(LLVMType, Int32), i - 1)]) + + push!(results, load!(builder, address)) + end + end + return Tuple(results) +end + +# Lowers function calls that pertain to array operations. +function lower_array_calls!(fun::LLVM.Function, malloc) + changed_any = false + alloc_methods = [ + :jl_alloc_array_1d, + :jl_alloc_array_2d, + :jl_alloc_array_3d, + :jl_new_array + ] + wrap_methods = [ + :jl_ptr_to_array, + :jl_ptr_to_array_1d + ] + runtime_methods = [ + :jl_array_grow_at, + :jl_array_grow_beg, + :jl_array_grow_end, + :jl_array_del_at, + :jl_array_del_beg, + :jl_array_del_end, + :jl_array_sizehint + ] + visit_literal_pointer_calls(fun) do call, name + args = collect(operands(call))[1:end - 1] + if name in alloc_methods + is_ptr, array_type_ptr = to_literal_pointer(args[1]) + if is_ptr + # We can lower array creation calls if we know the type + # of the array to create in advance. + array_type = unsafe_pointer_to_objref(array_type_ptr) + let builder = Builder(JuliaContext()) + position!(builder, call) + if name == :jl_new_array + # jl_new_array requires special treatment. All the other ones are + # pretty simple to handle. 
+ dim_args = extract_array_dims!(builder, array_type, args[2]) + else + dim_args = Tuple(args[2:end]) + end + new_array = new_array!(builder, malloc, array_type, dim_args) + replace_uses!(call, new_array) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed_any = true + end + elseif name in wrap_methods + is_ptr, array_type_ptr = to_literal_pointer(args[1]) + if is_ptr + # We can lower array wrapping calls if we know the type + # of the array to create in advance. + array_type = unsafe_pointer_to_objref(array_type_ptr) + let builder = Builder(JuliaContext()) + position!(builder, call) + if name == :jl_ptr_to_array + dim_args = extract_array_dims!(builder, array_type, args[3]) + else + dim_args = (args[3],) + end + new_array = new_array!(builder, malloc, array_type, dim_args; data_ptr=args[2]) + replace_uses!(call, new_array) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed_any = true + end + elseif name in runtime_methods + let builder = Builder(JuliaContext()) + position!(builder, call) + new_call = call!(builder, Runtime.get(name), args) + replace_uses!(call, new_call) + unsafe_delete!(LLVM.parent(call), call) + dispose(builder) + end + changed_any = true + end + end + return changed_any +end + +function lower_array_calls_gc!(fun::LLVM.Function) + lower_array_calls!(fun, Runtime.get(:gc_malloc_object)) +end + +function lower_array_calls_nogc!(fun::LLVM.Function) + lower_array_calls!(fun, Runtime.get(:gc_pool_alloc)) +end + +# Replaces all uses of a function in a particular module with +# a compatible function. +function replace_function!(mod::LLVM.Module, old_name::String, new_name::String) + if new_name == old_name + # There's nothing to replace if the new function is the same as + # the old function. + return false + end + + # Otherwise, we'll try and find the old function. + if !haskey(functions(mod), old_name) + # If the old function doesn't even appear in the module, then it's not in + # use and we can stop right here. + return false + end + + old_function = functions(mod)[old_name] + + if haskey(functions(mod), new_name) + new_function = functions(mod)[new_name] + else + # Create a new function. + new_function = LLVM.Function( + mod, + new_name, + eltype(llvmtype(old_function)::LLVM.PointerType)::LLVM.FunctionType) + end + + # Replace all uses of the old function with the new function. + replace_uses!(old_function, new_function) + + return true +end + +# Replaces all uses of the managed memory allocation function in a +# particular module with a compatible function with the specified name. +function replace_malloc!(mod::LLVM.Module, malloc_name::String) + return replace_function!(mod, "julia.managed_malloc", malloc_name) +end + # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible. 
# # this assumes and checks that the TLS is unused, which should be the case for most GPU code diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 3d5f33ac..bfff7454 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -122,26 +122,30 @@ end ## functionality to build the runtime library -function emit_function!(mod, cap, f, types, name) +function emit_function!(mod, cap, f, types, name, malloc) tt = Base.to_tuple_type(types) - new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false); - libraries=false, strict=false) + # Optimize the module that defines the function, but don't + # internalize symbols in that function yet: internalizing + # globals may de-alias references to globals in the runtime + # library from equivalent references in the kernel. + new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false; malloc=malloc); + libraries=false, strict=false, internalize=false) LLVM.name!(entry, name) link!(mod, new_mod) end -function build_runtime(cap) +function build_runtime(cap, malloc) mod = LLVM.Module("CUDAnative run-time library", JuliaContext()) for method in values(Runtime.methods) - emit_function!(mod, cap, method.def, method.types, method.llvm_name) + emit_function!(mod, cap, method.def, method.types, method.llvm_name, malloc) end mod end -function load_runtime(cap) - name = "cudanative.$(cap.major)$(cap.minor).bc" +function load_runtime(cap, malloc) + name = "cudanative.$(malloc).$(cap.major)$(cap.minor).bc" path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) mkpath(dirname(path)) @@ -151,8 +155,8 @@ function load_runtime(cap) parse(LLVM.Module, read(io), JuliaContext()) end else - @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device, this might take a while..." - lib = build_runtime(cap) + @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device (allocating with '$malloc'), this might take a while..." + lib = build_runtime(cap, malloc) open(path, "w") do io write(io, lib) end diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index af629ac7..60bf4a7a 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -231,7 +231,7 @@ function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst) end # detect calls to undefined functions - if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) + if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) && fn != job.malloc # figure out if the function lives in the Julia runtime library if libjulia[] == C_NULL paths = filter(Libdl.dllist()) do path diff --git a/src/device/runtime.jl b/src/device/runtime.jl index b3addaf0..a54c629c 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -12,8 +12,9 @@ module Runtime using ..CUDAnative using LLVM using LLVM.Interop +using CUDAdrv - +import ..CUDAnative: GCFrame ## representation of a runtime method instance struct RuntimeMethodInstance @@ -127,8 +128,35 @@ function T_prjlvalue() LLVM.PointerType(eltype(T_pjlvalue), Tracked) end +# A function that gets replaced by the proper 'malloc' implementation +# for the context it executes in. When the GC is used, calls to this +# function are replaced with 'gc_malloc'; otherwise, this function gets +# rewritten as a call to the allocator, probably 'malloc'. 
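+#
+# For example, the GC-enabled '@cuda' path in execution.jl compiles kernels with
+# malloc="ptx_gc_malloc", so once 'replace_malloc!' has run, every call to this
+# function in such a kernel module ends up calling that allocator instead.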
+@generated function managed_malloc(sz::Csize_t) + T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) + T_size = convert(LLVMType, Csize_t) + T_ptr = convert(LLVMType, Ptr{UInt8}) + + # create function + llvm_f, _ = create_function(T_ptr, [T_size]) + mod = LLVM.parent(llvm_f) + + intr = LLVM.Function(mod, "julia.managed_malloc", LLVM.FunctionType(T_pint8, [T_size])) + + # generate IR + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + ptr = call!(builder, intr, [parameters(llvm_f)[1]]) + jlptr = ptrtoint!(builder, ptr, T_ptr) + ret!(builder, jlptr) + end + + call_function(llvm_f, Ptr{UInt8}, Tuple{Csize_t}, :((sz,))) +end + function gc_pool_alloc(sz::Csize_t) - ptr = malloc(sz) + ptr = managed_malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) @@ -138,7 +166,6 @@ end compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) - ## boxing and unboxing const tag_type = UInt @@ -226,5 +253,357 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end +## Garbage collection + +# LLVM type of a pointer to a tracked pointer +function T_pprjlvalue() + T_pjlvalue = convert(LLVMType, Any, true) + LLVM.PointerType( + LLVM.PointerType(eltype(T_pjlvalue), Tracked)) +end + +# Include GC memory allocation functions into the runtime. +compile(CUDAnative.gc_malloc, Ptr{UInt8}, (Csize_t,)) +compile(CUDAnative.gc_malloc_object, Any, (Csize_t,), T_prjlvalue) + +# Include GC frame management functions into the runtime. +compile(CUDAnative.new_gc_frame, Any, (Cuint,), T_pprjlvalue) + +compile( + CUDAnative.push_gc_frame, + Nothing, + (GCFrame, Cuint), + () -> convert(LLVMType, Cvoid), + () -> [T_pprjlvalue(), convert(LLVMType, UInt32)]) + +compile( + CUDAnative.pop_gc_frame, + Nothing, + (GCFrame,), + () -> convert(LLVMType, Cvoid), + () -> [T_pprjlvalue()]) + +# Also import the safepoint and perma-safepoint functions. +compile(CUDAnative.gc_safepoint, Cvoid, ()) +compile(CUDAnative.gc_perma_safepoint, Cvoid, ()) + +## Bump allocator. + +# Allocates `bytesize` bytes of storage by bumping the global bump +# allocator pointer. +function bump_alloc(bytesize::Csize_t)::Ptr{UInt8} + ptr = CUDAnative.@cuda_global_ptr("bump_alloc_ptr", Csize_t) + chunk_address = CUDAnative.atomic_add!(ptr, bytesize) + end_ptr = unsafe_load(CUDAnative.@cuda_global_ptr("bump_alloc_end", Csize_t)) + if chunk_address < end_ptr + return Ptr{UInt8}(chunk_address) + else + return C_NULL + end +end + +compile(bump_alloc, Ptr{UInt8}, (Csize_t,)) + +function maybe_set_global(kernel, name, value::T) where T + try + global_handle = CuGlobal{T}(kernel.mod, name) + set(global_handle, value) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end +end + +function bump_alloc_init!(kernel, buffer_start, buffer_size) + maybe_set_global(kernel, "bump_alloc_ptr", buffer_start) + maybe_set_global(kernel, "bump_alloc_end", buffer_start + buffer_size) +end + +## Arrays + +# A data structure that carefully mirrors an in-memory array control +# structure for Julia arrays, as laid out by the compiler. +mutable struct Array1D + # This is the data layout for Julia arrays, which we adhere to here. 
+    #
+    # JL_EXTENSION typedef struct {
+    #     JL_DATA_TYPE
+    #     void *data;
+    # #ifdef STORE_ARRAY_LEN
+    #     size_t length;
+    # #endif
+    #     jl_array_flags_t flags;
+    #     uint16_t elsize;
+    #     uint32_t offset;  // for 1-d only. does not need to get big.
+    #     size_t nrows;
+    #     union {
+    #         // 1d
+    #         size_t maxsize;
+    #         // Nd
+    #         size_t ncols;
+    #     };
+    #     // other dim sizes go here for ndims > 2
+    #
+    #     // followed by alignment padding and inline data, or owner pointer
+    # } jl_array_t;
+
+    data::Ptr{UInt8}
+    length::Csize_t
+    flags::UInt16
+    elsize::UInt16
+    offset::UInt32
+    nrows::Csize_t
+    maxsize::Csize_t
+end
+
+# Sets the `count` bytes starting at `ptr` to zero.
+function zero_fill!(ptr::Ptr{UInt8}, count::Integer)
+    for i in 1:count
+        unsafe_store!(ptr, UInt8(0), i)
+    end
+    return
+end
+
+# Copies `sz` bytes from `src` to `dst`, coping with overlapping regions
+# in the same way as C's `memmove`.
+function memmove!(dst::Ptr{UInt8}, src::Ptr{UInt8}, sz::Integer)
+    if dst < src
+        for i in 1:sz
+            unsafe_store!(dst, unsafe_load(src, i), i)
+        end
+    else
+        for i in sz:-1:1
+            unsafe_store!(dst, unsafe_load(src, i), i)
+        end
+    end
+    return
+end
+
+# Resize the buffer to a max size of `newlen` elements.
+# The buffer can either be newly allocated or realloc'd; the return
+# value is true if a new buffer is allocated and false if it is realloc'd.
+# The caller needs to take care of moving the data from the old buffer
+# to the new one if necessary.
+# When this function returns, the `.data` pointer always points to
+# the **beginning** of the new buffer.
+function array_resize_buffer(a::Array1D, newlen::Csize_t)::Bool
+    elsz = Csize_t(a.elsize)
+    nbytes = newlen * elsz
+    oldnbytes = a.maxsize * elsz
+
+    if elsz == 1
+        nbytes += 1
+        oldnbytes += 1
+    end
+
+    # Allocate a new buffer. 'managed_malloc' will get replaced with
+    # the "right" allocation function for the environment in which this
+    # function is compiled. So if the GC is enabled, then 'managed_malloc'
+    # will actually call 'gc_malloc'; otherwise, it's probably going to
+    # be 'malloc'.
+    a.data = managed_malloc(nbytes)
+    zero_fill!(a.data + oldnbytes, nbytes - oldnbytes)
+    a.maxsize = newlen
+    return true
+end
+
+"""
+    jl_array_grow_at_impl(a, idx, inc, n)
+
+Grows one-dimensional array `a` containing `n` elements by `inc` elements at
+zero-based index `idx`.
+"""
+function jl_array_grow_at_impl(a::Array1D, idx::Csize_t, inc::Csize_t, n::Csize_t)
+    data = a.data
+    elsz = Csize_t(a.elsize)
+    reqmaxsize = a.offset + n + inc
+    has_gap = n > idx
+    nb1 = idx * elsz
+    nbinc = inc * elsz
+    if reqmaxsize > a.maxsize
+        if reqmaxsize < 4
+            newmaxsize = Csize_t(4)
+        elseif reqmaxsize >= a.maxsize * 2
+            newmaxsize = reqmaxsize
+        else
+            newmaxsize = a.maxsize * 2
+        end
+
+        newbuf = array_resize_buffer(a, newmaxsize)
+        newdata = a.data + a.offset * elsz
+        if newbuf
+            memmove!(newdata, data, nb1)
+            if has_gap
+                memmove!(newdata + nb1 + nbinc, data + nb1, n * elsz - nb1)
+            end
+        elseif has_gap
+            memmove!(newdata + nb1 + nbinc, newdata + nb1, n * elsz - nb1)
+        end
+        a.data = data = newdata
+    elseif has_gap
+        memmove!(data + nb1 + nbinc, data + nb1, n * elsz - nb1)
+    end
+
+    newnrows = n + inc
+    a.length = newnrows
+    a.nrows = newnrows
+    zero_fill!(data + nb1, nbinc)
+    return
+end
+
+"""
+    jl_array_grow_at(a, idx, inc)
+
+Grows one-dimensional array `a` by `inc` elements at zero-based index `idx`.
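+
+For example, growing a three-element array by two elements at index 1 moves the
+last two elements towards the end of the array and zero-fills the two freshly
+inserted slots, yielding a five-element array.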
+""" +function jl_array_grow_at(a::Array1D, idx::Cssize_t, inc::Csize_t) + jl_array_grow_at_impl(a, Csize_t(idx), inc, a.nrows) + return +end + +compile( + jl_array_grow_at, + Cvoid, + (Array1D, Cssize_t, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Cssize_t), convert(LLVMType, Csize_t)]) + +""" + jl_array_grow_end(a, inc) + +Grows one-dimensional array `a` by `inc` elements at the end. +""" +function jl_array_grow_end(a::Array1D, inc::Csize_t) + n = a.nrows + jl_array_grow_at_impl(a, n, inc, n) + return +end + +compile( + jl_array_grow_end, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_grow_beg(a, inc) + +Grows one-dimensional array `a` by `inc` elements at the beginning of the array. +""" +function jl_array_grow_beg(a::Array1D, inc::Csize_t) + jl_array_grow_at_impl(a, Csize_t(0), inc, a.nrows) + return +end + +compile( + jl_array_grow_beg, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_sizehint(a, sz) + +Suggest that one-dimensional array `a` reserve capacity for at least `sz` elements. +""" +function jl_array_sizehint(a::Array1D, sz::Csize_t) + n = a.length + data = a.data + elsz = Csize_t(a.elsize) + reqmaxsize = a.offset + sz + if reqmaxsize > a.maxsize + newbuf = array_resize_buffer(a, reqmaxsize) + newdata = a.data + a.offset * elsz + if newbuf + memmove!(newdata, data, n * elsz) + end + a.data = data = newdata + end + return +end + +compile( + jl_array_sizehint, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_del_at_impl(a, idx, dec, n) + +Removes `dec` elements from one-dimensional array `a`, starting at zero-based index `idx`. +`n` is the number of elements in `a`. +""" +function jl_array_del_at_impl(a::Array1D, idx::Csize_t, dec::Csize_t, n::Csize_t) + data = a.data + elsz = a.elsize + last = idx + dec + if n > last + memmove!(data + idx * elsz, data + last * elsz, (n - last) * elsz) + end + n -= dec + if elsz == 1 + Base.unsafe_store!(data, n + 1, UInt8(0)) + end + a.nrows = n + a.length = n + return +end + +""" + jl_array_del_beg(a, dec) + +Removes `dec` elements from the beginning of one-dimensional array `a`. +""" +function jl_array_del_beg(a::Array1D, dec::Csize_t) + jl_array_del_at_impl(a, Csize_t(0), dec, a.nrows) + return +end + +compile( + jl_array_del_beg, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + +""" + jl_array_del_end(a, dec) + +Removes `dec` elements from the end of one-dimensional array `a`. +""" +function jl_array_del_end(a::Array1D, dec::Csize_t) + n = a.nrows + jl_array_del_at_impl(a, n, dec, n) + return +end + +compile( + jl_array_del_end, + Cvoid, + (Array1D, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Csize_t)]) + + +""" + jl_array_del_at(a, idx, dec) + +Removes `dec` elements from one-dimensional array `a`, starting at zero-based index `idx`. 
+""" +function jl_array_del_at(a::Array1D, idx::Cssize_t, dec::Csize_t) + jl_array_del_at_impl(a, Csize_t(idx), dec, a.nrows) + return +end + +compile( + jl_array_del_at, + Cvoid, + (Array1D, Cssize_t, Csize_t), + () -> convert(LLVMType, Cvoid), + () -> [T_prjlvalue(), convert(LLVMType, Cssize_t), convert(LLVMType, Csize_t)]) end diff --git a/src/device/threading.jl b/src/device/threading.jl new file mode 100644 index 00000000..96e58f72 --- /dev/null +++ b/src/device/threading.jl @@ -0,0 +1,276 @@ +# This file implements threading primitives that work for CUDAnative kernels. + +export ReaderWriterLock, reader_locked, writer_locked, Mutex, try_lock, unlock + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function atomic_compare_exchange!(ptr::Ptr{T}, cmp::T, new::T)::T where T + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %result = cmpxchg volatile $lt* %ptr, $lt %1, $lt %2 acq_rel acquire + %rv = extractvalue { $lt, i1 } %result, 0 + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T, $T}, ptr, cmp, new)) +end + +@generated function atomic_rmw!(::Val{op}, lhs::Ptr{T}, rhs::T)::T where {op, T} + ptr_type = convert(LLVMType, Ptr{T}) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = atomicrmw volatile $(String(op)) $lt* %ptr, $lt %1 acq_rel + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T}), $T}, lhs, rhs)) +end + +# Atomically adds a value to a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_add!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:add), lhs, rhs) +end + +# Atomically subtracts a value from a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_subtract!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:sub), lhs, rhs) +end + +# Atomically computes the logical or of a value and a variable pointed +# to by a pointer. Returns the previous value stored in that variable. +function atomic_or!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:or), lhs, rhs) +end + +# Atomically assigns a new value to a variable pointed to by a pointer. +# Returns the previous value stored in that variable. +function atomic_exchange!(lhs::Ptr{T}, rhs::T)::T where T + atomic_rmw!(Val(:xchg), lhs, rhs) +end + +# Loads a value from a pointer. +@generated function volatile_load(ptr::Ptr{T})::T where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + %rv = load volatile $lt, $lt* %ptr + ret $lt %rv + """ + :(Core.Intrinsics.llvmcall($ir, $T, Tuple{$(Ptr{T})}, ptr)) +end + +# Stores a value at a particular address. +@generated function volatile_store!(ptr::Ptr{T}, value::T) where T + ptr_type = string(convert(LLVMType, Ptr{T})) + lt = string(convert(LLVMType, T)) + ir = """ + %ptr = inttoptr $ptr_type %0 to $lt* + store volatile $lt %1, $lt* %ptr + ret void + """ + :(Core.Intrinsics.llvmcall($ir, Cvoid, Tuple{$(Ptr{T}), $T}, ptr, value)) +end + +function unwrap_device_ptr(ptr::DevicePtr{T, A})::Ptr{T} where {T, A} + convert(Ptr{T}, convert(Csize_t, ptr)) +end + +const ReaderWriterLockState = Int64 + +""" +A reader-writer lock: a lock that supports concurrent access for +read operations and exclusive access for write operations. 
+""" +struct ReaderWriterLock + # A pointer to the reader-writer lock's state. The state + # is a counter that can be in one of the following states: + # + # * > 0: the lock is acquired by one or more readers. + # The state counter describes the number of readers + # that have acquired the lock. + # + # * = 0: the lock is idle. + # + # * < 0: the lock is acquired by a single writer. + # + state_ptr::Ptr{ReaderWriterLockState} +end + +ReaderWriterLock(state_ptr::DevicePtr{ReaderWriterLockState}) = + ReaderWriterLock(unwrap_device_ptr(state_ptr)) + +const max_rw_lock_readers = (1 << (sizeof(ReaderWriterLockState) * 8 - 1)) + +# Serializes execution of a function within a warp, to combat thread +# divergence-related deadlocks. +function warp_serialized(func::Function) + # Get the current thread's ID. + thread_id = threadIdx().x - 1 + + # Get the size of a warp. + size = warpsize() + + local result + i = 0 + while i < size + if thread_id % size == i + result = func() + end + i += 1 + end + return result +end + +""" + reader_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + +Acquires a reader-writer lock in reader mode, runs `func` while the lock is +acquired and releases the lock again. +""" +function reader_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + if !acquire_lock + return func() + end + + while true + # Increment the reader count. If the lock is in write-acquired mode, + # then the lock will stay in that mode (unless the reader count is + # exceeded, but that is virtually impossible). Otherwise, the lock + # will end up in read-acquired mode. + previous_state = atomic_add!(lock.state_ptr, 1) + + # If the lock was in the idle or read-acquired state, then + # it is now in read-acquired mode. + if previous_state >= 0 + # Run the function. + result = func() + # Decrement the reader count to release the reader lock. + atomic_add!(lock.state_ptr, -1) + # We're done here. + return result + end + + # Decrement the reader count and try again. + atomic_add!(lock.state_ptr, -1) + end +end + +""" + writer_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + +Acquires a reader-writer lock in writer mode, runs `func` while the lock is +acquired and releases the lock again. +""" +function writer_locked(func::Function, lock::ReaderWriterLock; acquire_lock=true) + if !acquire_lock + return func() + end + + warp_serialized() do + # Try to move the lock from 'idle' to 'write-acquired'. + while atomic_compare_exchange!(lock.state_ptr, 0, -max_rw_lock_readers) != 0 + end + + # We acquired the lock. Run the function. + result = func() + + # Release the lock by atomically adding `max_rw_lock_readers` to the + # lock's state. It's important that we use an atomic add instead of a + # simple store because a store might cause a race condition with `read_locked` + # that'll put us in a deadlock state. + atomic_add!(lock.state_ptr, max_rw_lock_readers) + + # We're done here. + return result + end +end + +# Gets the thread ID of the current thread. +@inline function get_thread_id() + return (blockIdx().x - 1) * blockDim().x + threadIdx().x +end + +# Gets the warp ID of the current thread. +@inline function get_warp_id() + return div(get_thread_id() - 1, warpsize()) + 1 +end + +const MutexState = UInt32 + +""" +A mutex: a lock that guarantees mutual exclusion. +""" +struct Mutex + # This GPU mutex implementation is based on + # Lock-based Synchronization for GPU Architectures + # by Yunlong Xu et al. 
+ state_ptr::Ptr{MutexState} +end + +Mutex(state_ptr::DevicePtr{MutexState}) = + Mutex(unwrap_device_ptr(state_ptr)) + +""" + unlock(mutex::Mutex) + +Unlocks a mutex. +""" +function unlock(mutex::Mutex) + threadfence() + tid = get_thread_id() + atomic_compare_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1), UInt32(0)) + return +end + +""" + try_lock(mutex::Mutex)::Bool + +Tries to acquire a lock on a mutex. Returns `true` +if a lock was acquired successfully; otherwise, `false`. +""" +function try_lock(mutex::Mutex)::Bool + tid = UInt32(get_thread_id()) + wsize = warpsize() + threadbit = UInt32(1) << (tid % wsize) + + mask = vote_ballot(true) + + bitset = @cuStaticSharedMem(UInt32, 128) + bitset_ptr = unwrap_device_ptr(pointer(bitset)) + sizeof(UInt32) * div(threadIdx().x - 1, wsize) + unsafe_store!(bitset_ptr, UInt32(0)) + + lock = atomic_or!(mutex.state_ptr, UInt32(1)) + if lock & UInt32(1) == UInt32(0) + # The lock is free. + atomic_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1)) + else + pre_owner = lock >> 1 + if pre_owner != tid + if div(lock, wsize << 1) == div(tid, wsize) && pre_owner > tid && (((mask >> (pre_owner % wsize)) & UInt32(1)) == UInt32(1)) + atomic_or!(bitset_ptr, UInt32(1 << (pre_owner % wsize))) + atomic_exchange!(mutex.state_ptr, UInt32((tid << 1) + 1)) + if (atomic_or!(mutex.state_ptr, UInt32(0)) >> 1) != tid + # Stealing failed. + atomic_or!(bitset_ptr, threadbit) + end + else + # Cannot steal. + atomic_or!(bitset_ptr, threadbit) + end + end + end + + if (unsafe_load(bitset_ptr) & threadbit) == UInt32(0) + threadfence() + return true + else + atomic_compare_exchange!(mutex.state_ptr, (tid << 1) + UInt32(1), UInt32(0)) + threadfence() + return false + end +end diff --git a/src/execution.jl b/src/execution.jl index 1783669a..34fc449e 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -8,8 +8,8 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and # the code it generates, or the execution function split_kwargs(kwargs) - macro_kws = [:dynamic] - compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name] + macro_kws = [:dynamic, :init, :gc_config] + compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :malloc, :gc] call_kws = [:cooperative, :blocks, :threads, :config, :shmem, :stream] macro_kwargs = [] compiler_kwargs = [] @@ -90,6 +90,9 @@ performed, scheduling a kernel launch on the current CUDA context. Several keyword arguments are supported that influence the behavior of `@cuda`. - `dynamic`: use dynamic parallelism to launch device-side kernels +- `gc`: set up a GC and use it to allocate memory; cannot be combined with `dynamic` +- `gc_config`: the GC configuration to use if `gc=true`; see [`GCConfiguration`](@ref) +- `malloc`: the name of the allocation function to use, if `gc` is not in use - arguments that influence kernel compilation: see [`cufunction`](@ref) and [`dynamic_cufunction`](@ref) - arguments that influence kernel launch: see [`CUDAnative.HostKernel`](@ref) and @@ -104,6 +107,7 @@ kernel to determine the launch configuration. A host-side kernel launch is done kernel_args = cudaconvert.(args) kernel_tt = Tuple{Core.Typeof.(kernel_args)...} kernel = cufunction(f, kernel_tt; compilation_kwargs) + prepare_kernel(kernel; environment_kwargs) kernel(kernel_args...; launch_kwargs) end @@ -132,20 +136,15 @@ macro cuda(ex...) 
args = call.args[2:end] code = quote end - macro_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) + env_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) vars, var_exprs = assign_args!(code, args) # handle keyword arguments that influence the macro's behavior - dynamic = false - for kwarg in macro_kwargs - key,val = kwarg.args - if key == :dynamic - isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value")) - dynamic = val::Bool - else - throw(ArgumentError("Unsupported keyword argument '$key'")) - end - end + dynamic = get_kwarg_or_default(env_kwargs, :dynamic, false) + isa(dynamic, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant Boolean")) + + gc = get_kwarg_or_default(compiler_kwargs, :gc, false) + isa(gc, Bool) || throw(ArgumentError("`gc` keyword argument to @cuda should be a constant Boolean")) if dynamic # FIXME: we could probably somehow support kwargs with constant values by either @@ -153,14 +152,98 @@ macro cuda(ex...) # IR when processing the dynamic parallelism marker isempty(compiler_kwargs) || error("@cuda dynamic parallelism does not support compiler keyword arguments") + # FIXME: update the GC to support dynamic parallelism somehow. + !gc || error("@cuda does not support both `gc=true` and `dynamic=true`") + # dynamic, device-side kernel launch push!(code.args, quote # we're in kernel land already, so no need to cudaconvert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) + prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel($(var_exprs...); $(map(esc, call_kwargs)...)) end) + elseif gc + # Find the stream on which the kernel is to be scheduled. + stream = get_kwarg_or_default(call_kwargs, :stream, CuDefaultStream()) + + # Get the total number of threads. + thread_count = get_kwarg_or_default(call_kwargs, :threads, 1) + + # Get the GC configuration. + config = get_kwarg_or_default(env_kwargs, :gc_config, GCConfiguration()) + + # GC-enabled host-side launch. + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local interrupt_buffer = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, sizeof(ready), CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) + local interrupt_pointer = Base.unsafe_convert(Ptr{UInt32}, interrupt_buffer) + unsafe_store!(interrupt_pointer, ready) + local device_interrupt_pointer = Base.unsafe_convert(CuPtr{UInt32}, interrupt_buffer) + + # Evaluate the GC configuration. + local gc_config = $(esc(config)) + + # Allocate a shared buffer for GC memory. + local gc_memory_size = initial_heap_size(gc_config, prod($(esc(thread_count)))) + local gc_heap = GCHeapDescription() + expand!(gc_heap, gc_memory_size) + local master_record = gc_init!(gc_heap, gc_config, prod($(esc(thread_count)))) + + # Define a kernel initialization function. + local function kernel_init(kernel) + # Set the interrupt state pointer. + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, device_interrupt_pointer) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + + # Set the GC master record. 
+ try + global_handle = CuGlobal{GCMasterRecord}(kernel.mod, "gc_master_record") + set(global_handle, master_record) + catch exception + # The GC info pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + local gc_report = GCReport() + local function handle_interrupt() + gc_collect_impl(master_record, gc_heap, gc_config, gc_report) + end + + try + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; malloc="ptx_gc_malloc", $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=kernel_init, $(map(esc, env_kwargs)...)) + gc_report.elapsed_time = Base.@elapsed begin + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts(handle_interrupt, interrupt_pointer, $(esc(stream))) + end + finally + CUDAdrv.Mem.free(interrupt_buffer) + free!(gc_heap) + end + gc_report + end + end) else # regular, host-side kernel launch # @@ -173,6 +256,7 @@ macro cuda(ex...) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + prepare_kernel(kernel; $(map(esc, env_kwargs)...)) kernel(kernel_args...; $(map(esc, call_kwargs)...)) end end) @@ -447,9 +531,25 @@ end return ex end +""" + prepare_kernel(kernel::Kernel{F,TT}; init::Function=nop_init_kernel) + +Prepares a kernel for execution by setting up an environment for that kernel. +This function should be invoked just prior to running the kernel. Its +functionality is included in [`@cuda`](@ref). + +The 'init' keyword argument is a function that takes a kernel as argument and +sets up an environment for the kernel. +""" +function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel, kw...) where {F,TT} + # Just call the 'init' function for now. + init(kernel) +end ## device-side API +# There doesn't seem to be a way to access the documentation for the call-syntax, +# so attach it to the type """ dynamic_cufunction(f, tt=Tuple{}) @@ -503,3 +603,8 @@ function nearest_warpsize(dev::CuDevice, threads::Integer) ws = CUDAdrv.warpsize(dev) return threads + (ws - threads % ws) % ws end + +function nop_init_kernel(kernel::AbstractKernel{F,TT}) where {F,TT} + # Do nothing. + return +end \ No newline at end of file diff --git a/src/gc.jl b/src/gc.jl new file mode 100644 index 00000000..0564097b --- /dev/null +++ b/src/gc.jl @@ -0,0 +1,1192 @@ +# This file contains a GC implementation for CUDAnative kernels. +# The sections below contain some basic info on how the garbage +# collector works. +# +# MEMORY ALLOCATION +# +# The GC's allocator uses free lists, i.e., the allocator maintains +# a list of all blocks that have not been allocated. Additionally, +# the allocator also maintains a list of all allocated blocks, so +# the collector knows which blocks it can free. +# +# GARBAGE COLLECTION +# +# The garbage collector itself is a semi-conservative, non-moving, +# mark-and-sweep, stop-the-world GC that runs on the host. +# The device may trigger the GC via an interrupt. +# +# The GC is semi-conservative in the sense that its set of roots +# is precise but objects are scanned in an imprecise way. 
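+# Roughly speaking, the roots recorded in the per-thread root buffers are exact,
+# whereas the contents of reachable objects are scanned conservatively: any word
+# that happens to point into GC-managed memory is treated as a reference.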
+#
+# After every garbage collection, the GC will compact free lists:
+# adjacent free list blocks will be merged and the free list will
+# be sorted based on block sizes to combat memory fragmentation.
+#
+# If a free list is deemed to be "starving" after a collection, i.e.,
+# its total amount of free bytes has dropped below some threshold,
+# then a fresh chunk of GC-managed memory is allocated and added to
+# the free list.
+#
+# SAFEPOINTS
+#
+# Every warp gets a flag that tells if that warp is in a safepoint.
+# When a collection is triggered, the collector waits for every warp
+# to reach a safepoint. The warps indicate that they have reached a
+# safepoint by setting the flag.
+#
+# MISCELLANEOUS
+#
+# Some miscellaneous GPU-related GC implementation details:
+#
+# * GC memory is shared by the host and device.
+# * Every thread gets a fixed region of memory for storing GC roots in.
+# * When the device runs out of GC memory, it requests an interrupt
+#   to mark and sweep.
+
+export gc_malloc, gc_malloc_object, gc_collect, gc_safepoint, GCConfiguration
+
+import Base: length, show
+import Printf: @sprintf
+
+# A data structure that precedes every chunk of memory that has been
+# allocated or put into the free list.
+struct FreeListRecord
+    # The size of the memory region this allocation record precedes.
+    # This size does not include the allocation record itself.
+    size::Csize_t
+
+    # A pointer to the next allocation record in the list. If this
+    # allocation record is part of the free list, then this pointer
+    # points to the next free list entry; otherwise, it points to the
+    # next entry in the list of allocated blocks.
+    next::Ptr{FreeListRecord}
+end
+
+@generated function get_field_pointer_impl(base_pointer::Ptr{TBase}, ::Val{field_name}) where {TBase, field_name}
+    index = Base.fieldindex(TBase, field_name)
+    offset = Base.fieldoffset(TBase, index)
+    type = Core.fieldtype(TBase, index)
+    :(Base.unsafe_convert(Ptr{$type}, base_pointer + $(offset)))
+end
+
+# Gets a pointer to a particular field.
+macro get_field_pointer(base_pointer, field_name)
+    :(get_field_pointer_impl($(esc(base_pointer)), Val($field_name)))
+end
+
+# Gets a pointer to the first byte of data managed by an allocation record.
+function data_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8}
+    Base.unsafe_convert(Ptr{UInt8}, record) + sizeof(FreeListRecord)
+end
+
+# Takes a pointer to the first byte of data managed by an allocation record
+# and produces a pointer to the record itself.
+function record_pointer(data::Ptr{UInt8})::Ptr{FreeListRecord}
+    Base.unsafe_convert(Ptr{FreeListRecord}, data) - sizeof(FreeListRecord)
+end
+
+# Gets a pointer to the first byte past the data managed by an allocation record.
+function data_end_pointer(record::Ptr{FreeListRecord})::Ptr{UInt8}
+    data_pointer(record) + unsafe_load(@get_field_pointer(record, :size))
+end
+
+# A data structure that describes a single GC "arena", i.e.,
+# a section of the heap that is managed by the GC. Every arena
+# has its own free list and allocation list.
+struct FreeListArena
+    # The allocation lock for the arena.
+    lock_state::ReaderWriterLockState
+
+    # The head of the free list.
+    free_list_head::Ptr{FreeListRecord}
+
+    # The head of the allocation list.
+    allocation_list_head::Ptr{FreeListRecord}
+end
+
+# Gets a free list arena's lock.
+get_lock(arena::Ptr{FreeListArena}) = ReaderWriterLock(@get_field_pointer(arena, :lock_state))
+
+const gc_align = Csize_t(16)
+
+# Aligns a pointer to an alignment boundary.
+function align_downward(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + address_int = Base.convert(Csize_t, address) + remainder = address_int % alignment + if remainder == Csize_t(0) + return address + else + return address + alignment - remainder + end +end + +# Aligns a pointer to an alignment boundary. +function align_upward(address::Ptr{T}, alignment::Csize_t = gc_align)::Ptr{T} where T + result = align_downward(address, alignment) + if result < address + result += alignment + end + result +end + +# Aligns a pointer to an alignment boundary. +function align_upward(offset::T, alignment::Csize_t = gc_align)::T where T <: Integer + convert(T, Csize_t(align_upward(convert(Ptr{UInt8}, Csize_t(offset)), alignment))) +end + +# Gets the size of an aligned header, including padding to satisfy +# alignment requirements. +@generated function header_size(::Type{T}, ::Val{alignment} = Val(gc_align))::UInt32 where {T, alignment} + result = align_upward(UInt32(sizeof(T)), alignment) + :($result) +end + +# A reference to a Julia object. +const ObjectRef = Ptr{Nothing} + +# A GC frame is just a pointer to an array of Julia objects. +const GCFrame = Ptr{ObjectRef} + +# The states a safepoint flag can have. +@enum SafepointState::UInt32 begin + # Indicates that a warp is not in a safepoint. + not_in_safepoint = 0 + # Indicates that a warp is in a safepoint. This + # flag will be reset to `not_in_safepoint` by the + # collector on the next collecotr. + in_safepoint = 1 + # Indicates that a warp is in a perma-safepoint: + # the collector will not try to set this type + # of safepoint back to `not_in_safepoint`. + in_perma_safepoint = 2 +end + +const LocalArena = FreeListArena +const GlobalArena = FreeListArena + +# A data structure that contains global GC info. This data +# structure is designed to be immutable: it should not be changed +# once the host has set it up. +struct GCMasterRecord + # The number of warps. + warp_count::UInt32 + + # The number of threads. + thread_count::UInt32 + + # The maximum size of a GC root buffer, i.e., the maximum number + # of roots per thread. + root_buffer_capacity::UInt32 + + # The number of local arenas. + local_arena_count::UInt32 + + # A pointer to a list of local GC arena pointers. + local_arenas::Ptr{Ptr{LocalArena}} + + # A pointer to the global GC arena. + global_arena::Ptr{GlobalArena} + + # A pointer to a list of safepoint flags. Every warp has its + # own flag. + safepoint_flags::Ptr{SafepointState} + + # A pointer to a list of root buffer pointers that point to the + # end of the root buffer for every thread. + root_buffer_fingers::Ptr{Ptr{ObjectRef}} + + # A pointer to a list of buffers that can be used to store GC roots in. + # These root buffers are partitioned into GC frames later on. + root_buffers::Ptr{ObjectRef} +end + +# Iterates through all arena pointers stored in a GC master record. +@inline function iterate_arenas(fun::Function, master_record::GCMasterRecord) + for i in 1:master_record.local_arena_count + fun(unsafe_load(master_record.local_arenas, i)) + end + fun(master_record.global_arena) +end + +# Gets the global GC interrupt lock. +@inline function get_interrupt_lock()::ReaderWriterLock + return ReaderWriterLock(@cuda_global_ptr("gc_interrupt_lock", ReaderWriterLockState)) +end + +# Runs a function in such a way that no collection phases will +# run as long as the function is executing. Use with care: this +# macro acquires the GC interrupt lock in reader mode, so careless +# use may cause deadlocks. 
+macro nocollect(func) + quote + local @inline function lock_callback() + $(esc(func)) + end + + reader_locked(lock_callback, get_interrupt_lock()) + end +end + +# Gets the GC master record. +@inline function get_gc_master_record()::GCMasterRecord + return unsafe_load(@cuda_global_ptr("gc_master_record", GCMasterRecord)) +end + +# Gets a pointer to the local arena for this thread. This +# pointer may be null if there are no local arenas. +@inline function get_local_arena()::Ptr{LocalArena} + master_record = get_gc_master_record() + if master_record.local_arena_count == UInt32(0) + return Base.unsafe_convert(Ptr{LocalArena}, C_NULL) + else + return unsafe_load( + master_record.local_arenas, + ((get_warp_id() - 1) % master_record.local_arena_count) + 1) + end +end + +""" + new_gc_frame(size::UInt32)::GCFrame + +Allocates a new GC frame. +""" +@inline function new_gc_frame(size::UInt32)::GCFrame + master_record = get_gc_master_record() + # Return the root buffer tip: that's where the new GC frame starts. + return unsafe_load(master_record.root_buffer_fingers, get_thread_id()) +end + +""" + push_gc_frame(gc_frame::GCFrame, size::UInt32) + +Registers a GC frame with the garbage collector. +""" +@inline function push_gc_frame(gc_frame::GCFrame, size::UInt32) + master_record = get_gc_master_record() + + threadid = get_thread_id() + next_rootbuf_start = master_record.root_buffers + threadid * master_record.root_buffer_capacity * sizeof(Ptr{ObjectRef}) + new_rootbuf_finger = gc_frame + size * sizeof(ObjectRef) + + # Check that we have enough room to push the GC frame. + if new_rootbuf_finger >= next_rootbuf_start + @cuprintf("Root buffer overflow in thread %ld.\n", threadid) + return + end + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + new_rootbuf_finger, + threadid) + return +end + +""" + pop_gc_frame(gc_frame::GCFrame) + +Deregisters a GC frame. +""" +@inline function pop_gc_frame(gc_frame::GCFrame) + master_record = get_gc_master_record() + + # Update the root buffer tip. + unsafe_store!( + master_record.root_buffer_fingers, + gc_frame, + get_thread_id()) + return +end + +""" + gc_safepoint() + +Signals that this warp has reached a GC safepoint. +""" +function gc_safepoint() + wait_for_interrupt() do + gc_set_safepoint_flag(in_safepoint; overwrite = false) + end + return +end + +""" + gc_perma_safepoint() + +Signals that this warp has reached a GC perma-safepoint: +the GC doesn't need to wait for this warp to reach a safepoint +before starting collections. Instead, the GC may assume that +the warp is already in a safepoint. + +Be careful with this function: all bets are off when this +function is used improperly. For a more controlled (but still +super dangerous) way to use perma-safepoints, see the +`@perma_safepoint` macro. +""" +function gc_perma_safepoint() + gc_set_safepoint_flag(in_perma_safepoint) + return +end + +# Sets this warp's safepoint flag to a particular state. +function gc_set_safepoint_flag(value::SafepointState; overwrite::Bool = true) + master_record = get_gc_master_record() + warp_id = get_warp_id() + safepoint_flag_ptr = master_record.safepoint_flags + sizeof(SafepointState) * (warp_id - 1) + if overwrite + volatile_store!(safepoint_flag_ptr, value) + else + atomic_compare_exchange!(safepoint_flag_ptr, not_in_safepoint, value) + end + return +end + +# Marks a region as a perma-safepoint: the entire region +# is a safepoint. Note that perma-safepoints are not allowed +# to include non-perma-safepoints. 
+macro perma_safepoint(expr) + quote + gc_perma_safepoint() + local result = $(esc(expr)) + gc_set_safepoint_flag(not_in_safepoint) + result + end +end + +# Tries to use a free-list entry to allocate a chunk of data of size `bytesize`, +# producing an appropriately-sized free list entry that prefixes the data. This +# entry is removed from the free list but not yet added to the allocation list. +function gc_take_list_entry( + entry_ptr::Ptr{Ptr{FreeListRecord}}, + entry::Ptr{FreeListRecord}, + bytesize::Csize_t)::Ptr{FreeListRecord} + + entry_data = unsafe_load(entry) + if entry_data.size < bytesize + # The entry is just too small. Return a `null` pointer. + return C_NULL + end + + # The entry's big enough, so we'll use it. If at all possible, we want + # to create a new entry from any unused memory in the entry. + + # Compute the address to return. + data_address = data_pointer(entry) + + # Compute the end of the free memory chunk. + end_address = data_address + entry_data.size + + # Compute the start address of the new free list entry. The data + # prefixed by the block needs to be aligned to a 16-byte boundary, + # but the block itself doesn't. + new_data_address = align_downward(data_address + bytesize) + new_entry_address = new_data_address - sizeof(FreeListRecord) + if new_entry_address < data_address + bytesize + new_entry_address += gc_align + new_data_address += gc_align + end + + # If we can place a new entry just past the allocation, then we should + # by all means do so. + if new_data_address < end_address + # Create a new free list entry. + new_entry_size = Csize_t(end_address) - Csize_t(new_data_address) + new_entry_ptr = Base.unsafe_convert(Ptr{FreeListRecord}, new_entry_address) + unsafe_store!( + new_entry_ptr, + FreeListRecord(new_entry_size, entry_data.next)) + + # Update this entry's `size` field to reflect the new entry's space + # requirements. + unsafe_store!( + @get_field_pointer(entry, :size)::Ptr{Csize_t}, + Csize_t(new_entry_address) - Csize_t(data_address)) + + # Update the free list pointer. + unsafe_store!(entry_ptr, new_entry_ptr) + else + # We can't create a new entry, but we still have to update the free + # list pointer. + unsafe_store!(entry_ptr, entry_data.next) + end + + return entry +end + +# Prepends a free list record to a free list. +function gc_add_to_free_list( + entry::Ptr{FreeListRecord}, + list_ptr::Ptr{Ptr{FreeListRecord}}) + + # Set the `next` pointer to the value stored at the allocation list pointer. + unsafe_store!( + @get_field_pointer(entry, :next)::Ptr{Ptr{FreeListRecord}}, + unsafe_load(list_ptr)) + + # Update the allocation list pointer to point to the entry. + unsafe_store!(list_ptr, entry) +end + +# Tries to allocate a chunk of memory from a free list. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +# If the result is non-null, then a free list record is +# returned that has been taken from the free list but not +# yet added to another list. +function gc_take_any_list_entry( + free_list_ptr::Ptr{Ptr{FreeListRecord}}, + bytesize::Csize_t)::Ptr{FreeListRecord} + + # To allocate memory, we will walk the free list until we find a suitable candidate. 
+ while true + free_list_item = unsafe_load(free_list_ptr) + + if free_list_item == C_NULL + return C_NULL + end + + result = gc_take_list_entry(free_list_ptr, free_list_item, bytesize) + if result != C_NULL + return result + end + + free_list_ptr = @get_field_pointer(free_list_item, :next)::Ptr{Ptr{FreeListRecord}} + end +end + +# Tries to allocate a chunk of memory from a free list. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +# +# This function is not thread-safe. +function gc_malloc_from_free_list(arena::Ptr{FreeListArena}, bytesize::Csize_t)::Ptr{UInt8} + free_list_ptr = @get_field_pointer(arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + allocation_list_ptr = @get_field_pointer(arena, :allocation_list_head)::Ptr{Ptr{FreeListRecord}} + + # Try to take the entry out of the free list. + result_entry = gc_take_any_list_entry(free_list_ptr, bytesize) + if result_entry == C_NULL + # The entry is just too small. Return a `null` pointer. + return C_NULL + end + + # At this point, all we need to do is update the allocation record to + # reflect the fact that it now represents an allocated block instead of + # a free block. + gc_add_to_free_list(result_entry, allocation_list_ptr) + + return data_pointer(result_entry) +end + +# Writes a pointer to a temporary GC frame. This will keep the pointer +# from getting collected until the caller has a chance to add it to its +# own GC frame. +function gc_protect(pointer::Ptr{UInt8}) + if pointer != Base.unsafe_convert(Ptr{UInt8}, C_NULL) + gc_frame = new_gc_frame(UInt32(1)) + unsafe_store!(gc_frame, Base.unsafe_convert(ObjectRef, pointer)) + end +end + +# Tries to allocate a chunk of memory in a particular GC arena. +# Returns a null pointer if no sufficiently large chunk of +# memory can be found. +function gc_malloc_local(arena::Ptr{FreeListArena}, bytesize::Csize_t; acquire_lock=true)::Ptr{UInt8} + # Acquire the arena's lock. + result_ptr = writer_locked(get_lock(arena); acquire_lock=acquire_lock) do + # Allocate a suitable region of memory. + gc_malloc_from_free_list(arena, bytesize) + end + + # If the resulting pointer is non-null, then we'll write it to a temporary GC frame. + # Our reasoning for doing this is that doing so ensures that the allocated memory + # won't get collected by the GC before the caller has a chance to add it to its + # own GC frame. + gc_protect(result_ptr) + return result_ptr +end + +# Transfers a block of free memory from one arena to another and then +# allocates a differently-sized block of memory from the destination +# arena. +function gc_transfer_and_malloc( + from_arena::Ptr{FreeListArena}, + to_arena::Ptr{FreeListArena}, + transfer_bytesize::Csize_t, + alloc_bytesize::Csize_t)::Ptr{UInt8} + + from_free_list = @get_field_pointer(from_arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + entry = writer_locked(get_lock(from_arena)) do + # Try to take the entry out of the free list. + gc_take_any_list_entry(from_free_list, transfer_bytesize) + end + + if entry == C_NULL + return C_NULL + else + to_free_list = @get_field_pointer(to_arena, :free_list_head)::Ptr{Ptr{FreeListRecord}} + return writer_locked(get_lock(to_arena)) do + gc_add_to_free_list(entry, to_free_list) + gc_malloc_local(to_arena, alloc_bytesize; acquire_lock=false) + end + end +end + +""" + gc_malloc(bytesize::Csize_t)::Ptr{UInt8} + +Allocates a blob of memory that is managed by the garbage collector. +This function is designed to be called by the device. 
+""" +function gc_malloc(bytesize::Csize_t)::Ptr{UInt8} + master_record = get_gc_master_record() + + function allocate() + # Try to allocate in the local arena second. If that doesn't + # work, we'll move on to the global arena, which is bigger but + # is shared by all threads. (We want to minimize contention + # on the global arena's lock.) + local_arena = get_local_arena() + if local_arena != C_NULL + local_ptr = gc_malloc_local(local_arena, bytesize) + if local_ptr != C_NULL + return local_ptr + end + else + # If there is no local arena then we will just have to allocate + # from the global arena directly. + return gc_malloc_local(master_record.global_arena, bytesize) + end + + # Try to use the global arena if all else fails, but only if the chunk + # of memory we want to allocate is sufficiently large. Allocating lots of + # small chunks in the global arena will result in undue contention and slow + # down kernels dramatically. + # + # If we need to allocate a small chunk of memory but the local arena is + # empty, then we will transfer a *much* larger chunk of memory from the global + # arena to the local arena. After that we'll allocate in the local arena. + min_global_alloc_size = Csize_t(256 * (1 << 10)) + if bytesize >= min_global_alloc_size + local_ptr = gc_malloc_local(master_record.global_arena, bytesize) + else + local_ptr = gc_transfer_and_malloc( + master_record.global_arena, + local_arena, + min_global_alloc_size, + bytesize) + end + return local_ptr + end + + # Try to malloc the object without host intervention. + ptr = @perma_safepoint @nocollect allocate() + if ptr != C_NULL + return ptr + end + + # We're out of memory, which means that we need the garbage collector + # to step in. Set a perma-safepoint and acquire the interrupt lock. + ptr = @perma_safepoint writer_locked(get_interrupt_lock()) do + # Try to allocate memory again. This is bound to fail for the + # first thread that acquires the interrupt lock, but it is quite + # likely to succeed if we are *not* in the first thread that + # acquired the garbage collector lock. + ptr2 = allocate() + + if ptr2 == C_NULL + # We are either the first thread to acquire the interrupt lock + # or the additional memory produced by a previous collection has + # already been exhausted. Trigger the garbage collector. + gc_collect_impl() + + # Try to malloc again. + ptr2 = gc_malloc_local(master_record.global_arena, bytesize) + end + ptr2 + end + if ptr != C_NULL + return ptr + end + + # Alright, so that was a spectacular failure. Let's just throw an exception. + @cuprintf("ERROR: Out of GPU GC memory (trying to allocate %i bytes)\n", bytesize) + # throw(OutOfMemoryError()) + return C_NULL +end + +""" + gc_malloc_object(bytesize::Csize_t) + +Allocates an object that is managed by the garbage collector. +This function is designed to be called by the device. +""" +function gc_malloc_object(bytesize::Csize_t) + unsafe_pointer_to_objref(gc_malloc(bytesize)) +end + +# Zero-fills a range of memory. +function zero_fill!(start_ptr::Ptr{UInt8}, size::Csize_t) + ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), start_ptr, 0, size) +end + +# Zero-fills a range of memory. +function zero_fill!(start_ptr::Ptr{UInt8}, end_ptr::Ptr{UInt8}) + zero_fill!(start_ptr, Csize_t(end_ptr) - Csize_t(start_ptr)) +end + +# Tries to free a block of memory from a particular arena. `record_ptr` +# must point to a pointer to the GC allocation record to free. It will +# be updated to point to the next allocation. 
+# +# This function is designed to be called by the host: it does not +# turn off collections. It can be called by the device, but in that +# case it should be prefixed by the `@nocollect` macro followed by +# a write lock acquisition on the arena's lock. +function gc_free_local( + arena::Ptr{FreeListArena}, + record_ptr::Ptr{Ptr{FreeListRecord}}) + + record = unsafe_load(record_ptr) + next_record_ptr = @get_field_pointer(record, :next) + free_list_head_ptr = @get_field_pointer(arena, :free_list_head) + + # Remove the record from the allocation list. + unsafe_store!(record_ptr, unsafe_load(next_record_ptr)) + + # Add the record to the free list and update its `next` pointer + # (but not in that order). + unsafe_store!(next_record_ptr, unsafe_load(free_list_head_ptr)) + unsafe_store!(free_list_head_ptr, record) + + # Zero-fill the newly freed block of memory. + zero_fill!(data_pointer(record), unsafe_load(@get_field_pointer(record, :size))) +end + +# Like 'gc_collect', but does not acquire the interrupt lock. +function gc_collect_impl() + interrupt_or_wait() + threadfence_system() +end + +""" + gc_collect() + +Triggers a garbage collection phase. This function is designed +to be called by the device rather than by the host. +""" +function gc_collect() + writer_locked(gc_collect_impl, get_interrupt_lock()) +end + +# One megabyte. +const MiB = 1 << 20 + +# A description of a region of memory that has been allocated to the GC heap. +const GCHeapRegion = CUDAdrv.Mem.HostBuffer + +# A description of all memory that has been allocated to the GC heap. +struct GCHeapDescription + # A list of the set of regions that comprise the GC heap. + regions::Array{GCHeapRegion, 1} +end + +GCHeapDescription() = GCHeapDescription([]) + +# A data structure that contains GC configuration parameters. +struct GCConfiguration + # The number of local arenas to create. + local_arena_count::Int + + # The max number of roots that can be stored per thread. + root_buffer_capacity::Int + + # The point at which the global arena is deemed to be starving, i.e., + # it no longer contains enough memory to perform basic allocations. + # If the global arena's free byte count stays below the arena starvation + # threshold after a collection phase, the collector will allocate + # additional memory to the arena such that it is no longer starving. + global_arena_starvation_threshold::Int + + # The initial size of the global arena, in bytes. + global_arena_initial_size::Int + + # The point at which a local arena is deemed to be starving, i.e., + # it no longer contains enough memory to perform basic allocations. + # If a local arena's free byte count stays below the arena starvation + # threshold after a collection phase, the collector will allocate + # additional memory to the arena such that it is no longer starving. + local_arena_starvation_threshold::Int + + # The initial size of a local arena, in bytes. + local_arena_initial_size::Int +end + +# Creates a GC configuration. 
+function GCConfiguration(;
+    local_arena_count::Integer = 8,
+    root_buffer_capacity::Integer = 256,
+    global_arena_starvation_threshold::Integer = 4 * MiB,
+    global_arena_initial_size::Integer = 2 * MiB,
+    local_arena_starvation_threshold::Integer = 1 * MiB,
+    local_arena_initial_size::Integer = 1 * MiB)
+
+    GCConfiguration(
+        local_arena_count,
+        root_buffer_capacity,
+        global_arena_starvation_threshold,
+        global_arena_initial_size,
+        local_arena_starvation_threshold,
+        local_arena_initial_size)
+end
+
+function initial_heap_size(config::GCConfiguration, thread_count::Integer)
+    warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device()))
+    local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count
+    safepoint_bytesize = sizeof(SafepointState) * warp_count
+    fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count
+    rootbuf_bytesize = sizeof(ObjectRef) * config.root_buffer_capacity * thread_count
+
+    result = 0
+    result += local_arenas_bytesize
+    result += safepoint_bytesize
+    result += fingerbuf_bytesize
+    result += rootbuf_bytesize
+    result += config.local_arena_count * config.local_arena_initial_size
+    result += config.global_arena_initial_size
+    return result
+end
+
+# Initializes a GC heap and produces a master record.
+function gc_init!(
+    heap::GCHeapDescription,
+    config::GCConfiguration,
+    thread_count::Integer)::GCMasterRecord
+
+    warp_count = Base.ceil(UInt32, thread_count / CUDAdrv.warpsize(device()))
+
+    master_region = heap.regions[1]
+
+    gc_memory_start_ptr = pointer(master_region)
+    gc_memory_end_ptr = pointer(master_region) + sizeof(master_region)
+
+    # Allocate a local arena pointer buffer.
+    local_arenas_bytesize = sizeof(Ptr{LocalArena}) * config.local_arena_count
+    local_arenas_ptr = Base.unsafe_convert(Ptr{Ptr{LocalArena}}, gc_memory_start_ptr)
+
+    # Allocate the safepoint flag buffer.
+    safepoint_bytesize = sizeof(SafepointState) * warp_count
+    safepoint_ptr = Base.unsafe_convert(Ptr{SafepointState}, local_arenas_ptr + local_arenas_bytesize)
+
+    # Allocate root buffers.
+    fingerbuf_bytesize = sizeof(Ptr{ObjectRef}) * thread_count
+    fingerbuf_ptr = Base.unsafe_convert(Ptr{Ptr{ObjectRef}}, safepoint_ptr + safepoint_bytesize)
+    rootbuf_bytesize = sizeof(ObjectRef) * config.root_buffer_capacity * thread_count
+    rootbuf_ptr = Base.unsafe_convert(Ptr{ObjectRef}, fingerbuf_ptr + fingerbuf_bytesize)
+
+    # Populate the root buffer fingers.
+    for i in 1:thread_count
+        unsafe_store!(fingerbuf_ptr, rootbuf_ptr + (i - 1) * sizeof(ObjectRef) * config.root_buffer_capacity, i)
+    end
+
+    # Compute a pointer to the start of the arena memory.
+    arena_start_ptr = rootbuf_ptr + rootbuf_bytesize
+
+    # Set up local arenas.
+    for i in 1:config.local_arena_count
+        local_arena = make_gc_arena!(LocalArena, arena_start_ptr, Csize_t(config.local_arena_initial_size))
+        unsafe_store!(local_arenas_ptr, local_arena, i)
+        arena_start_ptr += config.local_arena_initial_size
+    end
+
+    # Set up the global arena.
+    global_arena = make_gc_arena!(GlobalArena, arena_start_ptr, Csize_t(gc_memory_end_ptr) - Csize_t(arena_start_ptr))
+
+    return GCMasterRecord(
+        warp_count,
+        UInt32(thread_count),
+        UInt32(config.root_buffer_capacity),
+        UInt32(config.local_arena_count),
+        local_arenas_ptr,
+        global_arena,
+        safepoint_ptr,
+        fingerbuf_ptr,
+        rootbuf_ptr)
+end
+
+# Takes a zero-filled region of memory and turns it into a block
+# managed by the GC, prefixed with an allocation record.
+function make_gc_block!(start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListRecord} where T
+    entry = Base.unsafe_convert(Ptr{FreeListRecord}, start_ptr)
+    unsafe_store!(
+        entry,
+        FreeListRecord(
+            Csize_t(start_ptr + size) - Csize_t(data_pointer(entry)),
+            C_NULL))
+    return entry
+end
+
+# Takes a zero-filled region of memory and turns it into an arena
+# managed by the GC, prefixed with an arena record.
+function make_gc_arena!(::Type{FreeListArena}, start_ptr::Ptr{T}, size::Csize_t)::Ptr{FreeListArena} where T
+    # Create a single free list entry.
+    first_entry_ptr = make_gc_block!(start_ptr + sizeof(FreeListArena), size - sizeof(FreeListArena))
+
+    # Set up the arena record.
+    arena = Base.unsafe_convert(Ptr{FreeListArena}, start_ptr)
+    unsafe_store!(
+        arena,
+        FreeListArena(0, first_entry_ptr, C_NULL))
+
+    arena
+end
+
+# Tells if a GC heap contains a particular pointer.
+function contains(heap::GCHeapDescription, ptr::Ptr{T}) where T
+    for region in heap.regions
+        if ptr >= pointer(region) && ptr < pointer(region) + sizeof(region)
+            return true
+        end
+    end
+    return false
+end
+
+# Expands the GC heap by allocating a region of memory and adding it to
+# the list of allocated regions. `size` describes the amount of bytes to
+# allocate. Returns the allocated region.
+function expand!(heap::GCHeapDescription, size::Integer)::GCHeapRegion
+    region = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, size, CUDAdrv.Mem.HOSTALLOC_DEVICEMAP)
+    push!(heap.regions, region)
+    return region
+end
+
+# Frees all memory allocated by a GC heap.
+function free!(heap::GCHeapDescription)
+    for region in heap.regions
+        CUDAdrv.Mem.free(region)
+    end
+end
+
+# A sorted list of all allocation records for allocated blocks.
+# This data structure is primarily useful for rapidly mapping
+# pointers to the allocated blocks that contain them.
+struct SortedAllocationList
+    # An array of pointers to allocation records. The pointers
+    # are all sorted.
+    records::Array{Ptr{FreeListRecord}, 1}
+end
+
+length(alloc_list::SortedAllocationList) = length(alloc_list.records)
+
+# Gets a pointer to the allocation record that manages the memory
+# pointed to by `pointer`. Returns a null pointer if there is no
+# such record.
+function get_record(
+    alloc_list::SortedAllocationList,
+    pointer::Ptr{T})::Ptr{FreeListRecord} where T
+
+    # Deal with these cases quickly so we can assume that the
+    # allocation list is nonempty and `pointer` falls within its range.
+    if length(alloc_list) == 0 ||
+        pointer < data_pointer(alloc_list.records[1]) ||
+        pointer >= data_end_pointer(alloc_list.records[end])
+
+        return C_NULL
+    end
+
+    # To quickly narrow down the search space, we will do a binary search
+    # for the biggest allocation record pointer that is smaller than `pointer`.
+    range_start, range_end = 1, length(alloc_list)
+    while range_end - range_start > 4
+        range_mid = div(range_start + range_end, 2)
+        mid_val = alloc_list.records[range_mid]
+        if mid_val > pointer
+            range_end = range_mid
+        else
+            range_start = range_mid
+        end
+    end
+
+    # Make sure that the pointer actually points to a region of memory
+    # that is managed by the candidate record we found.
+    for record in alloc_list.records[range_start:range_end]
+        if pointer >= data_pointer(record) && pointer < data_end_pointer(record)
+            return record
+        end
+    end
+    return C_NULL
+end
+
+# Iterates through a linked list of allocation records and applies a function
+# to every node in the linked list.
+function iterate_allocation_records(fun::Function, head::Ptr{FreeListRecord}) + while head != C_NULL + fun(head) + head = unsafe_load(head).next + end +end + +# Iterates through all active allocation records in a GC arena. +function iterate_allocated(fun::Function, arena::Ptr{FreeListArena}) + allocation_list_head = unsafe_load(arena).allocation_list_head + iterate_allocation_records(fun, allocation_list_head) +end + +# Iterates through all free allocation records in a GC arena. +function iterate_free(fun::Function, arena::Ptr{FreeListArena}) + free_list_head = unsafe_load(arena).free_list_head + iterate_allocation_records(fun, free_list_head) +end + +# Takes a GC master record and constructs a sorted allocation list +# based on it. +function sort_allocation_list(master_record::GCMasterRecord)::SortedAllocationList + records = [] + iterate_arenas(master_record) do arena + iterate_allocated(arena) do record + push!(records, record) + end + end + sort!(records) + return SortedAllocationList(records) +end + +# Frees all dead blocks in an arena. +function gc_free_garbage(arena::Ptr{FreeListArena}, live_blocks::Set{Ptr{FreeListRecord}}) + record_ptr = @get_field_pointer(arena, :allocation_list_head) + while true + record = unsafe_load(record_ptr) + if record == C_NULL + # We've reached the end of the list. + break + end + + if record in live_blocks + # We found a live block. Proceed to the next block. + record_ptr = @get_field_pointer(record, :next) + else + # We found a dead block. Release it. Don't proceed to the + # next block because the current block will change in the + # next iteration of this loop. + gc_free_local(arena, record_ptr) + end + end +end + +# Compact a GC arena's free list. This function will +# 1. merge adjancent free blocks, and +# 2. reorder free blocks to put small blocks at the front +# of the free list, +# 3. tally the total number of free bytes and return that number. +function gc_compact(arena::Ptr{FreeListArena})::Csize_t + # Let's start by creating a list of all free list records. + records = Ptr{FreeListRecord}[] + iterate_free(arena) do record + push!(records, record) + end + + # We now sort those records and loop through the sorted list, + # merging free list entries as we go along. + sort!(records) + + i = 1 + while i < length(records) + first_record = records[i] + second_record = records[i + 1] + if data_end_pointer(first_record) == Base.unsafe_convert(Ptr{UInt8}, second_record) + # We found two adjacent free list entries. Expand the first + # record's size to encompass both entries, zero-fill the second + # record's header and delete it from the list of records. + new_size = Csize_t(data_end_pointer(second_record)) - Csize_t(data_pointer(first_record)) + zero_fill!(data_end_pointer(first_record), data_pointer(second_record)) + unsafe_store!(@get_field_pointer(first_record, :size), new_size) + deleteat!(records, i + 1) + else + i += 1 + end + end + + # Now sort the records based on size. Put the smallest records first to + # discourage fragmentation. + sort!(records; lt = (x, y) -> unsafe_load(x).size < unsafe_load(y).size) + + # Reconstruct the free list as a linked list. + prev_record_ptr = @get_field_pointer(arena, :free_list_head) + for record in records + unsafe_store!(prev_record_ptr, record) + prev_record_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(prev_record_ptr, C_NULL) + + # Compute the total number of free bytes. 
+ return sum(map(record -> unsafe_load(record).size, records)) +end + +# Expands a GC arena by assigning it an additional heap region. +function gc_expand(arena::Ptr{FreeListArena}, region::GCHeapRegion) + extra_record = make_gc_block!(pointer(region), Csize_t(sizeof(region))) + last_free_list_ptr = @get_field_pointer(arena, :free_list_head) + iterate_free(arena) do record + last_free_list_ptr = @get_field_pointer(record, :next) + end + unsafe_store!(last_free_list_ptr, extra_record) +end + +"""A report of the GC's actions.""" +mutable struct GCReport + """The total wall-clock time of a kernel execution.""" + elapsed_time::Float64 + + """The number of collections that were performed.""" + collection_count::Int + + """The total wall-clock time of all collection polls.""" + collection_poll_time::Float64 + + """The total wall-clock time of all collections.""" + collection_time::Float64 + + """The total amount of additional memory allocated to local pools.""" + extra_local_memory::Csize_t + + """The total amount of additional memory allocated to the global pool.""" + extra_global_memory::Csize_t + + GCReport() = new(0.0, 0, 0.0, 0.0, Csize_t(0), Csize_t(0)) +end + +function show(io::IO, report::GCReport) + print(io, "[wall-clock time: $(@sprintf("%.4f", report.elapsed_time)) s; ") + print(io, "collections: $(report.collection_count); ") + poll_percentage = 100 * report.collection_poll_time / report.elapsed_time + print(io, "total poll time: $(@sprintf("%.4f", report.collection_poll_time)) s ($(@sprintf("%.2f", poll_percentage))%); ") + collection_percentage = 100 * report.collection_time / report.elapsed_time + print(io, "total collection time: $(@sprintf("%.4f", report.collection_time)) s ($(@sprintf("%.2f", collection_percentage))%); ") + print(io, "extra local memory: $(div(report.extra_local_memory, MiB)) MiB; ") + print(io, "extra global memory: $(div(report.extra_global_memory, MiB)) MiB]") +end + +# Collects garbage. This function is designed to be called by the host, +# not by the device. +function gc_collect_impl(master_record::GCMasterRecord, heap::GCHeapDescription, config::GCConfiguration, report::GCReport) + poll_time = Base.@elapsed begin + # First off, we have to wait for all warps to reach a safepoint. Clear + # safepoint flags and wait for warps to set them again. + for i in 0:(master_record.warp_count - 1) + atomic_compare_exchange!( + master_record.safepoint_flags + i * sizeof(SafepointState), + in_safepoint, + not_in_safepoint) + end + safepoint_count = 0 + while safepoint_count != master_record.warp_count + safepoint_count = 0 + for i in 0:(master_record.warp_count - 1) + state = volatile_load(master_record.safepoint_flags + i * sizeof(SafepointState)) + if state != not_in_safepoint + safepoint_count += 1 + end + end + end + end + + collection_time = Base.@elapsed begin + + # The Julia CPU GC is precise and the information it uses for precise + # garbage collection is stored in memory that we should be able to access. + # However, the way the CPU GC stores field information is incredibly + # complicated and replicating that logic here would be a royal pain to + # implement and maintain. Ideally, the CPU GC would expose an interface that + # allows us to point to an object and ask the GC for all GC-tracked pointers + # it contains. Alas, no such luck: the CPU GC doesn't even have an internal + # function that does that. The CPU GC's logic for finding GC-tracked pointer + # fields is instead fused tightly with its 'mark' loop. 
+ # + # To cope with this, we will simply implement a semi-conservative GC: we precisely + # scan the roots for pointers into the GC heap. We then recursively mark blocks + # that are pointed to by such pointers as live and conservatively scan them for + # more pointers. + # + # Our mark phase is fairly simple: we maintain a worklist of pointers that + # are live and may need to be processed, as well as a set of blocks that are + # live and have already been processed. + live_blocks = Set{Ptr{FreeListRecord}}() + live_worklist = Ptr{ObjectRef}[] + + # Get a sorted allocation list, which will allow us to classify live pointers quickly. + alloc_list = sort_allocation_list(master_record) + + # Add all roots to the worklist. + for i in 1:(master_record.root_buffer_capacity * master_record.thread_count) + root = unsafe_load(master_record.root_buffers, i) + if root != C_NULL + push!(live_worklist, root) + end + end + + # Now process all live pointers until we reach a fixpoint. + while !isempty(live_worklist) + # Pop a pointer from the worklist. + object_ref = pop!(live_worklist) + # Get the block for that pointer. + record = get_record(alloc_list, object_ref) + # Make sure that we haven't visited the block yet. + if record != C_NULL && !(record in live_blocks) + # Mark the block as live. + push!(live_blocks, record) + # Add all pointer-sized, aligned values to the live pointer worklist. + for ptr in data_pointer(record):sizeof(ObjectRef):data_end_pointer(record) - 1 + value = unsafe_load(Base.unsafe_convert(Ptr{ObjectRef}, ptr)) + push!(live_worklist, value) + end + end + end + + # We're done with the mark phase! Time to proceed to the sweep phase. + # The first thing we'll do is iterate through every arena's allocation list and + # free dead blocks. Next, we will compact and reorder free lists to combat + # fragmentation. + iterate_arenas(master_record) do arena + # Free garbage blocks. + gc_free_garbage(arena, live_blocks) + + # Compact the arena. + free_memory = gc_compact(arena) + + # If the amount of free memory in the arena is below the starvation + # limit then we'll expand the GC heap and add the additional memory + # to the arena's free list. + threshold = if arena == master_record.global_arena + config.global_arena_starvation_threshold + else + config.local_arena_starvation_threshold + end + + if free_memory < threshold + region = expand!(heap, threshold) + gc_expand(arena, region) + + if arena == master_record.global_arena + report.extra_global_memory += Csize_t(threshold) + else + report.extra_local_memory += Csize_t(threshold) + end + end + end + end + report.collection_count += 1 + report.collection_time += collection_time + report.collection_poll_time += poll_time +end + +# Examines a keyword argument list and gets either the value +# assigned to a key or a default value. +function get_kwarg_or_default(kwarg_list, key::Symbol, default) + for kwarg in kwarg_list + arg_key, val = kwarg.args + if arg_key == key + return val + end + end + return default +end diff --git a/src/interrupts.jl b/src/interrupts.jl new file mode 100644 index 00000000..5251af4d --- /dev/null +++ b/src/interrupts.jl @@ -0,0 +1,252 @@ +# This file implements a high-level generic device-to-host interrupt +# mechanism. This file also contains non-trivial support infrastructure +# that should either be moved to CUDAdrv or exposed by CUDAnative. +# Note that this support infrastructure is not exported, so it remains +# an implementation detail as opposed to a part of CUDAnative's public +# API. 
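+#
+# Typical usage, as a hedged sketch (`my_kernel` and `my_handler` are
+# hypothetical names; the handler argument and keyword syntax match the
+# `@cuda_interruptible` macro defined below):
+#
+#     my_handler() = println("interrupt handled on the host")
+#
+#     function my_kernel()
+#         interrupt()   # ask the host to run the handler and wait for it
+#         return
+#     end
+#
+#     @cuda_interruptible my_handler threads=32 my_kernel()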
+ +import CUDAdrv: @apicall + +export @cuda_interruptible, interrupt, interrupt_or_wait, wait_for_interrupt + +# Queries a stream for its status. +function query_stream(stream::CUDAdrv.CuStream = CuDefaultStream())::Cint + return ccall( + (:cuStreamQuery, CUDAdrv.libcuda), + Cint, + (CUDAdrv.CuStream_t,), + stream) +end + +# Gets a pointer to a global with a particular name. If the global +# does not exist yet, then it is declared in the global memory address +# space. +@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T} + T_global = convert(LLVMType, T) + T_result = convert(LLVMType, Ptr{T}) + + # Create a thunk that computes a pointer to the global. + llvm_f, _ = create_function(T_result) + mod = LLVM.parent(llvm_f) + + # Figure out if the global has been defined already. + global_set = LLVM.globals(mod) + global_name_string = String(global_name) + if haskey(global_set, global_name_string) + global_var = global_set[global_name_string] + else + # If the global hasn't been defined already, then we'll define + # it in the global address space, i.e., address space one. + global_var = GlobalVariable(mod, T_global, global_name_string, 1) + linkage!(global_var, LLVM.API.LLVMLinkOnceAnyLinkage) + initializer!(global_var, LLVM.null(T_global)) + end + + # Generate IR that computes the global's address. + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + + # Cast the global variable's type to the result type. + result = ptrtoint!(builder, global_var, T_result) + ret!(builder, result) + end + + # Call the function. + call_function(llvm_f, Ptr{T}) +end + +macro cuda_global_ptr(name, type) + return :(get_global_pointer( + $(Val(Symbol(name))), + $(esc(type)))) +end + +# Gets a pointer to the interrupt region. +@inline function get_interrupt_pointer()::Ptr{UInt32} + # Compute a pointer to the global in which a pointer to the + # interrupt state is stored. + ptr = @cuda_global_ptr("interrupt_pointer", Ptr{UInt32}) + # state the pointer, netting us a pointer to the interrupt + # region. + return Base.unsafe_load(ptr) +end + +# The interrupt state is a 32-bit unsigned integer that +# can have one of the following values: +# +# * 0: host is ready to process an interrupt, no interrupt +# is currently being processed. +# * 1: device has requested an interrupt, the interrupt +# has not completed processing yet. +# +const ready = UInt32(0) +const processing = UInt32(1) + +""" + interrupt_or_wait() + +Requests an interrupt and waits until the interrupt completes. +If an interrupt is already running, then this function waits +for that interrupt to complete, but does not request an interrupt +of its own. Returns `true` if an interrupt was successfully +requested by this function; otherwise, `false`. +""" +function interrupt_or_wait()::Bool + state_ptr = get_interrupt_pointer() + prev_state = atomic_compare_exchange!(state_ptr, ready, processing) + wait_for_interrupt() + return prev_state == ready +end + +""" + wait_for_interrupt(fun::Function) + +Waits for the current interrupt to finish, if an interrupt is +currently running. A function is repeatedly executed until the +interrupt finishes. +""" +function wait_for_interrupt(fun::Function) + state_ptr = get_interrupt_pointer() + while volatile_load(state_ptr) == processing + fun() + end +end + +""" + wait_for_interrupt() + +Waits for the current interrupt to finish, if an interrupt is +currently running. 
+""" +function wait_for_interrupt() + wait_for_interrupt() do + end +end + +""" + interrupt() + +Repeatedly requests an interrupt until one is requested successfully. +""" +function interrupt() + while !interrupt_or_wait() + end +end + +# Waits for the current kernel to terminate and handle +# any interrupts that we encounter along the way. +function handle_interrupts(handler::Function, state::Ptr{UInt32}, stream::CuStream = CuDefaultStream()) + while true + # Sleep to save processing power. + sleep(0.001) + + # Query the CUDA stream. + status = query_stream(stream) + if status == CUDAdrv.SUCCESS.code + # The kernel has finished running. We're done here. + return + elseif status == CUDAdrv.ERROR_NOT_READY.code + # The kernel is still running. Check if an interrupt + # needs handling. + if volatile_load(state) == processing + # Run the handler. + handler() + # Set the interrupt state to 'ready'. + volatile_store!(state, ready) + end + + # Continue querying the stream. + else + # Whoa. Something both unexpected and unpleasant seems + # to have happened. Better throw an exception here. + throw(CuError(status)) + end + end +end + +""" + @cuda_interruptible [kwargs...] func(args...) + +High-level interface for executing code on a GPU with support for interrupts. +The `@cuda_interruptible` macro should prefix a call, with `func` a callable function +or object that should return nothing. It will be compiled to a CUDA function upon first +use, and to a certain extent arguments will be converted and anaged automatically using +`cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel +launch on the current CUDA context. + +Several keyword arguments are supported that influence kernel compilation and execution. For +more information, refer to the documentation of respectively [`cufunction`](@ref) and +[`CUDAnative.Kernel`](@ref). +""" +macro cuda_interruptible(handler, ex...) + # destructure the `@cuda_interruptible` expression + if length(ex) > 0 && ex[1].head == :tuple + error("The tuple argument to @cuda has been replaced by keywords: `@cuda_interruptible handler threads=... fun(args...)`") + end + call = ex[end] + kwargs = ex[1:end-1] + + # destructure the kernel call + if call.head != :call + throw(ArgumentError("second argument to @cuda_interruptible should be a function call")) + end + f = call.args[1] + args = call.args[2:end] + + code = quote end + env_kwargs, compiler_kwargs, call_kwargs = CUDAnative.split_kwargs(kwargs) + vars, var_exprs = CUDAnative.assign_args!(code, args) + + # Find the stream on which the kernel is to be scheduled. + stream = CuDefaultStream() + for kwarg in call_kwargs + key, val = kwarg.args + if key == :stream + stream = val + end + end + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) begin + # Define a trivial buffer that contains the interrupt state. + local interrupt_buffer = CUDAdrv.Mem.alloc(CUDAdrv.Mem.HostBuffer, sizeof(ready), CUDAdrv.Mem.HOSTALLOC_DEVICEMAP) + local interrupt_pointer = Base.unsafe_convert(Ptr{UInt32}, interrupt_buffer) + unsafe_store!(interrupt_pointer, ready) + local device_interrupt_pointer = Base.unsafe_convert(CuPtr{UInt32}, interrupt_buffer) + + try + # Define a kernel initialization function that sets the + # interrupt state pointer. 
+ local function interrupt_kernel_init(kernel) + try + global_handle = CuGlobal{CuPtr{UInt32}}(kernel.mod, "interrupt_pointer") + set(global_handle, device_interrupt_pointer) + catch exception + # The interrupt pointer may not have been declared (because it is unused). + # In that case, we should do nothing. + if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code + rethrow() + end + end + end + + # Standard kernel setup logic. + local kernel_args = CUDAnative.cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = CUDAnative.cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + CUDAnative.prepare_kernel(kernel; init=interrupt_kernel_init, $(map(esc, env_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + + # Handle interrupts. + handle_interrupts($(esc(handler)), interrupt_pointer, $(esc(stream))) + finally + CUDAdrv.Mem.free(interrupt_buffer) + end + end + end) + return code +end diff --git a/test/device/gc.jl b/test/device/gc.jl new file mode 100644 index 00000000..640d5ebf --- /dev/null +++ b/test/device/gc.jl @@ -0,0 +1,70 @@ +@testset "gc" begin + +############################################################################################ + +dummy() = return + +dummy_handler(kernel) = return + +@testset "@cuda gc=true" begin + +@testset "allocate and collect" begin + # This test allocates many very small and very large objects. Both the small + # and large objects become garbage eventually, but small objects need to + # outlive the large objects (and not be collected erroneously) for the test + # to pass. So essentially this test tackles three things: + # + # 1. Allocation works. + # 2. Collection works. + # 3. Collection isn't gung-ho to the point of incorrectness. + # + + mutable struct TempStruct + data::Float32 + end + + @noinline function escape(val) + Base.pointer_from_objref(val) + end + + # Define a kernel that copies values using a temporary struct. + function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + for j in 1:2 + # Allocate a mutable struct and make sure it ends up on the GC heap. + temp = TempStruct(unsafe_load(a, i)) + escape(temp) + + # Allocate a large garbage buffer to force collections. + gc_malloc(Csize_t(256 * 1024)) + + # Use the mutable struct. If its memory has been reclaimed (by accident) + # then we expect the test at the end of this file to fail. + unsafe_store!(b, temp.data, i) + end + + return + end + + thread_count = 64 + + # Allocate two arrays. + source_array = Mem.alloc(Float32, thread_count) + destination_array = Mem.alloc(Float32, thread_count) + source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array) + destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array) + + # Fill the source and destination arrays. + Mem.upload!(source_array, fill(42.f0, thread_count)) + Mem.upload!(destination_array, zeros(Float32, thread_count)) + + # Run the kernel. 
+ @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer) + + @test Mem.download(Float32, destination_array, thread_count) == fill(42.f0, thread_count) +end + +end + +end diff --git a/test/device/interrupts.jl b/test/device/interrupts.jl new file mode 100644 index 00000000..a07a57e8 --- /dev/null +++ b/test/device/interrupts.jl @@ -0,0 +1,87 @@ +@testset "interrupts" begin + +############################################################################################ + +dummy() = return + +dummy_handler(kernel) = return + +@testset "@cuda_interruptible" begin + +@test_throws UndefVarError @cuda_interruptible dummy_handler undefined() +@test_throws MethodError @cuda_interruptible dummy_handler dummy(1) + +@testset "compilation params" begin + @cuda_interruptible dummy_handler dummy() + + @test_throws CuError @cuda_interruptible dummy_handler threads=2 maxthreads=1 dummy() + @cuda_interruptible dummy_handler threads=2 dummy() +end + +@testset "count" begin + # This test uses interrupts to increment a host counter and then + # checks that the counter's value equals the number of interrupts. + # This is a useful thing to check because it verifies that interrupts + # are neither skipped nor performed twice. + # + # We will use a sizeable number of threads (128) to give us a better + # shot at detecting concurrency errors, if any. The number of skipped + # interrupts is unlikely to equal the number of additional, unwanted + # interrupts for this many threads. + thread_count = 128 + + # Define a kernel that makes the host count. + function increment_counter() + interrupt() + return + end + + # Configure the interrupt to increment a counter. + global counter = 0 + function handle_interrupt() + global counter + counter += 1 + end + + # Run the kernel. + @cuda_interruptible handle_interrupt threads=thread_count increment_counter() + + # Check that the counter's final value equals the number + # of threads. + @test counter == thread_count +end + +@testset "count in stream" begin + # This test is a copy of the previous test, but it uses a non-default + # CUDA stream. This should Just Work: @cuda_interruptible should + # intercept the `stream=...` argument and pass it to the stream-querying + # logic. All of this should be entirely transparent to the user. + thread_count = 128 + + # Define a kernel that makes the host count. + function increment_counter() + interrupt() + return + end + + # Configure the interrupt to increment a counter. + global counter = 0 + function handle_interrupt() + global counter + counter += 1 + end + + # Define a CUDA stream. + exec_stream = CuStream() + + # Run the kernel. + @cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream increment_counter() + + # Check that the counter's final value equals the number + # of threads. + @test counter == thread_count +end + +end + +end diff --git a/test/device/threading.jl b/test/device/threading.jl new file mode 100644 index 00000000..fa9533b1 --- /dev/null +++ b/test/device/threading.jl @@ -0,0 +1,91 @@ +@testset "threading" begin + +############################################################################################ + +@testset "reader-writer lock" begin + +@testset "writers only" begin + + thread_count = 128 + + # Define a kernel that atomically increments a counter using a lock. 
+ function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + lock = ReaderWriterLock(lock_state) + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + return + end + + # Allocate memory for the counter and the lock. + counter_buf = Mem.alloc(sizeof(Int32)) + Mem.upload!(counter_buf, [Int32(0)]) + counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + + lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) + Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) + lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + + # Run the kernel. + @cuda threads=thread_count increment_counter(counter_pointer, lock_pointer) + + # Check that the counter's final value equals the number + # of threads. + @test Mem.download(Int32, counter_buf) == [Int32(thread_count)] + +end + +@testset "readers and writers" begin + + thread_count = 128 + + # Define a kernel. + function mutate_counter_maybe(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.ReaderWriterLockState}) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + lock = ReaderWriterLock(lock_state) + # Read the previous counter and update the current counter. + # Do this many times. + if i % 16 == 0 + # Some threads get to atomically increment the counter. + writer_locked(lock) do + unsafe_store!(counter, unsafe_load(counter) + 1) + end + else + # All the other threads acquire the lock in reader mode + # and check that the counter's value doesn't change. + reader_locked(lock) do + counter_ptr = convert(Ptr{Int32}, convert(Csize_t, counter)) + counter_val = CUDAnative.volatile_load(counter_ptr) + j = 0 + while j < 10 + if CUDAnative.volatile_load(counter_ptr) != counter_val + throw(ErrorException("oh no")) + end + j += 1 + end + end + end + return + end + + # Allocate memory for the counter and the lock. + counter_buf = Mem.alloc(sizeof(Int32)) + Mem.upload!(counter_buf, [Int32(0)]) + counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf) + + lock_buf = Mem.alloc(sizeof(CUDAnative.ReaderWriterLockState)) + Mem.upload!(lock_buf, [CUDAnative.ReaderWriterLockState(0)]) + lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.ReaderWriterLockState}, lock_buf) + + # Run the kernel. + @cuda threads=thread_count mutate_counter_maybe(counter_pointer, lock_pointer) + + # Check that the counter's final value equals the number + # of threads. + @test Mem.download(Int32, counter_buf) == [Int32(thread_count / 16)] + +end + +end + +end diff --git a/test/runtests.jl b/test/runtests.jl index f4346620..d124cef9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -68,9 +68,12 @@ else else include("device/codegen.jl") include("device/execution.jl") + include("device/interrupts.jl") include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") + include("device/threading.jl") + include("device/gc.jl") include("examples.jl") end