-
Notifications
You must be signed in to change notification settings - Fork 6
Open
Labels
performanceMust go fasterMust go faster
Description
Reported by @matthias314 on discourse, summing a FixedSizeVector seems to be slightly slower than a Vector:
julia> for n in (0, 2^0, 2^5, 2^10, 2^15, 2^20)
@info "n = $(n)"
@eval @btime sum(v) setup=(Random.seed!(1); v=rand!(Vector{Float64}(undef, $(n))))
@eval @btime sum(v) setup=(Random.seed!(1); v=rand!(FixedSizeVector{Float64}(undef, $(n))))
end
[ Info: n = 0
2.416 ns (0 allocations: 0 bytes)
1.166 ns (0 allocations: 0 bytes)
[ Info: n = 1
2.125 ns (0 allocations: 0 bytes)
2.708 ns (0 allocations: 0 bytes)
[ Info: n = 32
9.593 ns (0 allocations: 0 bytes)
9.885 ns (0 allocations: 0 bytes)
[ Info: n = 1024
101.183 ns (0 allocations: 0 bytes)
103.459 ns (0 allocations: 0 bytes)
[ Info: n = 32768
3.318 μs (0 allocations: 0 bytes)
3.427 μs (0 allocations: 0 bytes)
[ Info: n = 1048576
117.416 μs (0 allocations: 0 bytes)
117.666 μs (0 allocations: 0 bytes)The difference isn't dramatic, but mostly systematic. But what really puzzles me is that I can't tell why. The LLVM IR looks nearly identical for the two containers, except for the fact FixedSizeVector needs to construct the MemoryRef, which is something which came up already in #70.
julia> code_llvm(Base.mapreduce_impl, (typeof(identity), typeof(Base.add_sum), Vector{Float64}, Int, Int, Int); )LLVM IR for Vector
; Function Signature: mapreduce_impl(typeof(Base.identity), typeof(Base.add_sum), Array{Float64, 1}, Int64, Int64, Int64)
; @ reduce.jl:251 within `mapreduce_impl`
; Function Attrs: noinline
define double @julia_mapreduce_impl_7334(ptr noundef nonnull align 8 dereferenceable(24) %"A::Array", i64 signext %"ifirst::Int64", i64 signext %"ilast::Int64", i64 signext %"blksize::Int64") local_unnamed_addr #0 {
top:
; @ reduce.jl:253 within `mapreduce_impl`
; ┌ @ promotion.jl:637 within `==`
%.not = icmp eq i64 %"ifirst::Int64", %"ilast::Int64"
; └
br i1 %.not, label %L17, label %L22
L17: ; preds = %top
; @ reduce.jl:254 within `mapreduce_impl`
; ┌ @ essentials.jl:953 within `getindex`
%memoryref_data = load ptr, ptr %"A::Array", align 8
%0 = getelementptr double, ptr %memoryref_data, i64 %"ifirst::Int64"
%memoryref_data1 = getelementptr i8, ptr %0, i64 -8
%1 = load double, ptr %memoryref_data1, align 8
; └
; @ reduce.jl:255 within `mapreduce_impl`
br label %common.ret
common.ret: ; preds = %L126, %L83, %middle.block, %L39, %L17
%common.ret.op = phi double [ %1, %L17 ], [ %32, %L126 ], [ %6, %L39 ], [ %23, %middle.block ], [ %26, %L83 ]
; @ reduce.jl within `mapreduce_impl`
ret double %common.ret.op
L22: ; preds = %top
; @ reduce.jl:256 within `mapreduce_impl`
; ┌ @ int.jl:86 within `-`
%2 = sub i64 %"ilast::Int64", %"ifirst::Int64"
; └
; ┌ @ int.jl:83 within `<`
%.not49 = icmp slt i64 %2, %"blksize::Int64"
; └
br i1 %.not49, label %L39, label %L126
L39: ; preds = %L22
; @ reduce.jl:258 within `mapreduce_impl`
; ┌ @ essentials.jl:953 within `getindex`
%memoryref_data6 = load ptr, ptr %"A::Array", align 8
%3 = getelementptr double, ptr %memoryref_data6, i64 %"ifirst::Int64"
%memoryref_data13 = getelementptr i8, ptr %3, i64 -8
%4 = load double, ptr %memoryref_data13, align 8
; └
; @ reduce.jl:259 within `mapreduce_impl`
; ┌ @ essentials.jl:953 within `getindex`
%5 = load double, ptr %3, align 8
; └
; @ reduce.jl:260 within `mapreduce_impl`
; ┌ @ reduce.jl:19 within `add_sum`
; │┌ @ float.jl:492 within `+`
%6 = fadd double %4, %5
; └└
; @ reduce.jl:261 within `mapreduce_impl`
; ┌ @ simdloop.jl:69 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%7 = add i64 %"ifirst::Int64", 2
; │└
; │┌ @ range.jl:5 within `Colon`
; ││┌ @ range.jl:415 within `UnitRange`
; │││┌ @ range.jl:426 within `unitrange_last`
; ││││┌ @ operators.jl:472 within `>=`
; │││││┌ @ int.jl:522 within `<=`
%.not50 = icmp sgt i64 %7, %"ilast::Int64"
; ││││└└
%8 = add i64 %"ifirst::Int64", 1
%value_phi = select i1 %.not50, i64 %8, i64 %"ilast::Int64"
; │└└└
; │ @ simdloop.jl:71 within `macro expansion`
; │┌ @ simdloop.jl:51 within `simd_inner_length`
; ││┌ @ range.jl:783 within `length`
; │││┌ @ int.jl:86 within `-`
%9 = sub i64 %value_phi, %7
%.not5152 = icmp ult i64 %9, 9223372036854775807
; │└└└
; │ @ simdloop.jl:72 within `macro expansion`
br i1 %.not5152, label %L83.lr.ph, label %common.ret
L83.lr.ph: ; preds = %L39
%10 = getelementptr double, ptr %memoryref_data6, i64 %7
; │ @ simdloop.jl:75 within `macro expansion`
%invariant.gep = getelementptr i8, ptr %10, i64 -8
%11 = xor i64 %"ifirst::Int64", -1
%12 = add i64 %value_phi, %11
%min.iters.check = icmp ult i64 %12, 8
br i1 %min.iters.check, label %L83, label %vector.ph
vector.ph: ; preds = %L83.lr.ph
%n.vec = and i64 %12, -8
%13 = insertelement <2 x double> <double poison, double -0.000000e+00>, double %6, i64 0
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <2 x double> [ %13, %vector.ph ], [ %18, %vector.body ]
%vec.phi55 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %19, %vector.body ]
%vec.phi56 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %20, %vector.body ]
%vec.phi57 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %21, %vector.body ]
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:262
; │┌ @ essentials.jl:953 within `getindex`
%14 = getelementptr double, ptr %invariant.gep, i64 %index
%15 = getelementptr i8, ptr %14, i64 16
%16 = getelementptr i8, ptr %14, i64 32
%17 = getelementptr i8, ptr %14, i64 48
%wide.load = load <2 x double>, ptr %14, align 8
%wide.load58 = load <2 x double>, ptr %15, align 8
%wide.load59 = load <2 x double>, ptr %16, align 8
%wide.load60 = load <2 x double>, ptr %17, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:263
; │┌ @ reduce.jl:19 within `add_sum`
; ││┌ @ float.jl:492 within `+`
%18 = fadd reassoc contract <2 x double> %vec.phi, %wide.load
%19 = fadd reassoc contract <2 x double> %vec.phi55, %wide.load58
%20 = fadd reassoc contract <2 x double> %vec.phi56, %wide.load59
%21 = fadd reassoc contract <2 x double> %vec.phi57, %wide.load60
; │└└
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%index.next = add nuw i64 %index, 8
%22 = icmp eq i64 %index.next, %n.vec
br i1 %22, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; │└└
; │ @ simdloop.jl:75 within `macro expansion`
%bin.rdx = fadd reassoc contract <2 x double> %19, %18
%bin.rdx61 = fadd reassoc contract <2 x double> %20, %bin.rdx
%bin.rdx62 = fadd reassoc contract <2 x double> %21, %bin.rdx61
%23 = call reassoc contract double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx62)
%cmp.n = icmp eq i64 %12, %n.vec
br i1 %cmp.n, label %common.ret, label %L83
L83: ; preds = %L83, %middle.block, %L83.lr.ph
%value_phi2754 = phi i64 [ %24, %L83 ], [ 0, %L83.lr.ph ], [ %n.vec, %middle.block ]
%value_phi2653 = phi double [ %26, %L83 ], [ %6, %L83.lr.ph ], [ %23, %middle.block ]
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%24 = add nuw nsw i64 %value_phi2754, 1
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:262
; │┌ @ essentials.jl:953 within `getindex`
%gep = getelementptr double, ptr %invariant.gep, i64 %value_phi2754
%25 = load double, ptr %gep, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:263
; │┌ @ reduce.jl:19 within `add_sum`
; ││┌ @ float.jl:492 within `+`
%26 = fadd reassoc contract double %value_phi2653, %25
; │└└
; │ @ simdloop.jl:75 within `macro expansion`
; │┌ @ int.jl:83 within `<`
%exitcond.not = icmp eq i64 %value_phi2754, %9
; │└
br i1 %exitcond.not, label %common.ret, label %L83
L126: ; preds = %L22
; └
; @ reduce.jl:268 within `mapreduce_impl`
; ┌ @ int.jl:542 within `>>` @ int.jl:535
%27 = ashr i64 %2, 1
; └
; ┌ @ int.jl:87 within `+`
%28 = add i64 %27, %"ifirst::Int64"
; └
; @ reduce.jl:269 within `mapreduce_impl`
%29 = call double @julia_mapreduce_impl_7334(ptr %"A::Array", i64 signext %"ifirst::Int64", i64 signext %28, i64 signext %"blksize::Int64")
; @ reduce.jl:270 within `mapreduce_impl`
; ┌ @ int.jl:87 within `+`
%30 = add i64 %28, 1
; └
%31 = call double @julia_mapreduce_impl_7334(ptr %"A::Array", i64 signext %30, i64 signext %"ilast::Int64", i64 signext %"blksize::Int64")
; @ reduce.jl:271 within `mapreduce_impl`
; ┌ @ reduce.jl:19 within `add_sum`
; │┌ @ float.jl:492 within `+`
%32 = fadd double %29, %31
; └└
br label %common.ret
}julia> code_llvm(Base.mapreduce_impl, (typeof(identity), typeof(Base.add_sum), FixedSizeVectorDefault{Float64}, Int, Int, Int); )LLVM IR for FixedSizeVectorDefault
; Function Signature: mapreduce_impl(typeof(Base.identity), typeof(Base.add_sum), FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}}, Int64, Int64, Int64)
; @ reduce.jl:251 within `mapreduce_impl`
; Function Attrs: noinline
define double @julia_mapreduce_impl_7339(ptr nocapture noundef nonnull readonly align 8 dereferenceable(16) %"A::FixedSizeArray", ptr nocapture readonly %.roots.A, i64 signext %"ifirst::Int64", i64 signext %"ilast::Int64", i64 signext %"blksize::Int64") local_unnamed_addr #0 {
top:
%gcframe1 = alloca [4 x ptr], align 16
call void @llvm.memset.p0.i64(ptr align 16 %gcframe1, i8 0, i64 32, i1 true)
%0 = getelementptr inbounds nuw i8, ptr %gcframe1, i64 24
%1 = getelementptr inbounds nuw i8, ptr %gcframe1, i64 16
%pgcstack = call ptr inttoptr (i64 4377493260 to ptr)(i64 4377493296) #12
store i64 8, ptr %gcframe1, align 8
%task.gcstack = load ptr, ptr %pgcstack, align 8
%frame.prev = getelementptr inbounds nuw i8, ptr %gcframe1, i64 8
store ptr %task.gcstack, ptr %frame.prev, align 8
store ptr %gcframe1, ptr %pgcstack, align 8
%memoryref_mem = load ptr, ptr %.roots.A, align 8
; @ reduce.jl:253 within `mapreduce_impl`
; ┌ @ promotion.jl:637 within `==`
%.not = icmp eq i64 %"ifirst::Int64", %"ilast::Int64"
; └
br i1 %.not, label %L19, label %L26
L19: ; preds = %top
; @ reduce.jl:254 within `mapreduce_impl`
; ┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%memory_data_ptr = getelementptr inbounds nuw i8, ptr %memoryref_mem, i64 8
%memoryref_data = load ptr, ptr %memory_data_ptr, align 8
%2 = getelementptr double, ptr %memoryref_data, i64 %"ifirst::Int64"
%memoryref_data2 = getelementptr i8, ptr %2, i64 -8
%3 = load double, ptr %memoryref_data2, align 8
; └
; @ reduce.jl:255 within `mapreduce_impl`
br label %common.ret
common.ret: ; preds = %L142, %L95, %middle.block, %L45, %L19
%common.ret.op = phi double [ %3, %L19 ], [ %34, %L142 ], [ %8, %L45 ], [ %25, %middle.block ], [ %28, %L95 ]
%frame.prev96 = load ptr, ptr %frame.prev, align 8
store ptr %frame.prev96, ptr %pgcstack, align 8
; @ reduce.jl within `mapreduce_impl`
ret double %common.ret.op
L26: ; preds = %top
; @ reduce.jl:256 within `mapreduce_impl`
; ┌ @ int.jl:86 within `-`
%4 = sub i64 %"ilast::Int64", %"ifirst::Int64"
; └
; ┌ @ int.jl:83 within `<`
%.not72 = icmp slt i64 %4, %"blksize::Int64"
; └
br i1 %.not72, label %L45, label %L142
L45: ; preds = %L26
; @ reduce.jl:258 within `mapreduce_impl`
; ┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%memory_data_ptr5 = getelementptr inbounds nuw i8, ptr %memoryref_mem, i64 8
%memoryref_data7 = load ptr, ptr %memory_data_ptr5, align 8
%5 = getelementptr double, ptr %memoryref_data7, i64 %"ifirst::Int64"
%memoryref_data12 = getelementptr i8, ptr %5, i64 -8
%6 = load double, ptr %memoryref_data12, align 8
; └
; @ reduce.jl:259 within `mapreduce_impl`
; ┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%7 = load double, ptr %5, align 8
; └
; @ reduce.jl:260 within `mapreduce_impl`
; ┌ @ reduce.jl:19 within `add_sum`
; │┌ @ float.jl:492 within `+`
%8 = fadd double %6, %7
; └└
; @ reduce.jl:261 within `mapreduce_impl`
; ┌ @ simdloop.jl:69 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%9 = add i64 %"ifirst::Int64", 2
; │└
; │┌ @ range.jl:5 within `Colon`
; ││┌ @ range.jl:415 within `UnitRange`
; │││┌ @ range.jl:426 within `unitrange_last`
; ││││┌ @ operators.jl:472 within `>=`
; │││││┌ @ int.jl:522 within `<=`
%.not73 = icmp sgt i64 %9, %"ilast::Int64"
; ││││└└
%10 = add i64 %"ifirst::Int64", 1
%value_phi = select i1 %.not73, i64 %10, i64 %"ilast::Int64"
; │└└└
; │ @ simdloop.jl:71 within `macro expansion`
; │┌ @ simdloop.jl:51 within `simd_inner_length`
; ││┌ @ range.jl:783 within `length`
; │││┌ @ int.jl:86 within `-`
%11 = sub i64 %value_phi, %9
%.not7475 = icmp ult i64 %11, 9223372036854775807
; │└└└
; │ @ simdloop.jl:72 within `macro expansion`
br i1 %.not7475, label %L95.lr.ph, label %common.ret
L95.lr.ph: ; preds = %L45
%12 = getelementptr double, ptr %memoryref_data7, i64 %9
; │ @ simdloop.jl:75 within `macro expansion`
%invariant.gep = getelementptr i8, ptr %12, i64 -8
%13 = xor i64 %"ifirst::Int64", -1
%14 = add i64 %value_phi, %13
%min.iters.check = icmp ult i64 %14, 8
br i1 %min.iters.check, label %L95, label %vector.ph
vector.ph: ; preds = %L95.lr.ph
%n.vec = and i64 %14, -8
%15 = insertelement <2 x double> <double poison, double -0.000000e+00>, double %8, i64 0
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <2 x double> [ %15, %vector.ph ], [ %20, %vector.body ]
%vec.phi78 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %21, %vector.body ]
%vec.phi79 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %22, %vector.body ]
%vec.phi80 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %23, %vector.body ]
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:262
; │┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%16 = getelementptr double, ptr %invariant.gep, i64 %index
%17 = getelementptr i8, ptr %16, i64 16
%18 = getelementptr i8, ptr %16, i64 32
%19 = getelementptr i8, ptr %16, i64 48
%wide.load = load <2 x double>, ptr %16, align 8
%wide.load81 = load <2 x double>, ptr %17, align 8
%wide.load82 = load <2 x double>, ptr %18, align 8
%wide.load83 = load <2 x double>, ptr %19, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:263
; │┌ @ reduce.jl:19 within `add_sum`
; ││┌ @ float.jl:492 within `+`
%20 = fadd reassoc contract <2 x double> %vec.phi, %wide.load
%21 = fadd reassoc contract <2 x double> %vec.phi78, %wide.load81
%22 = fadd reassoc contract <2 x double> %vec.phi79, %wide.load82
%23 = fadd reassoc contract <2 x double> %vec.phi80, %wide.load83
; │└└
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%index.next = add nuw i64 %index, 8
%24 = icmp eq i64 %index.next, %n.vec
br i1 %24, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; │└└
; │ @ simdloop.jl:75 within `macro expansion`
%bin.rdx = fadd reassoc contract <2 x double> %21, %20
%bin.rdx84 = fadd reassoc contract <2 x double> %22, %bin.rdx
%bin.rdx85 = fadd reassoc contract <2 x double> %23, %bin.rdx84
%25 = call reassoc contract double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx85)
%cmp.n = icmp eq i64 %14, %n.vec
br i1 %cmp.n, label %common.ret, label %L95
L95: ; preds = %L95, %middle.block, %L95.lr.ph
%value_phi2377 = phi i64 [ %26, %L95 ], [ 0, %L95.lr.ph ], [ %n.vec, %middle.block ]
%value_phi2276 = phi double [ %28, %L95 ], [ %8, %L95.lr.ph ], [ %25, %middle.block ]
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%26 = add nuw nsw i64 %value_phi2377, 1
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:262
; │┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%gep = getelementptr double, ptr %invariant.gep, i64 %value_phi2377
%27 = load double, ptr %gep, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:263
; │┌ @ reduce.jl:19 within `add_sum`
; ││┌ @ float.jl:492 within `+`
%28 = fadd reassoc contract double %value_phi2276, %27
; │└└
; │ @ simdloop.jl:75 within `macro expansion`
; │┌ @ int.jl:83 within `<`
%exitcond.not = icmp eq i64 %value_phi2377, %11
; │└
br i1 %exitcond.not, label %common.ret, label %L95
L142: ; preds = %L26
; └
; @ reduce.jl:268 within `mapreduce_impl`
; ┌ @ int.jl:542 within `>>` @ int.jl:535
%29 = ashr i64 %4, 1
; └
; ┌ @ int.jl:87 within `+`
%30 = add i64 %29, %"ifirst::Int64"
; └
; @ reduce.jl:269 within `mapreduce_impl`
store ptr %memoryref_mem, ptr %0, align 8
%31 = call double @julia_mapreduce_impl_7339(ptr nocapture readonly %"A::FixedSizeArray", ptr nocapture nonnull readonly %0, i64 signext %"ifirst::Int64", i64 signext %30, i64 signext %"blksize::Int64")
; @ reduce.jl:270 within `mapreduce_impl`
; ┌ @ int.jl:87 within `+`
%32 = add i64 %30, 1
; └
store ptr %memoryref_mem, ptr %1, align 8
%33 = call double @julia_mapreduce_impl_7339(ptr nocapture readonly %"A::FixedSizeArray", ptr nocapture nonnull readonly %1, i64 signext %32, i64 signext %"ilast::Int64", i64 signext %"blksize::Int64")
; @ reduce.jl:271 within `mapreduce_impl`
; ┌ @ reduce.jl:19 within `add_sum`
; │┌ @ float.jl:492 within `+`
%34 = fadd double %31, %33
; └└
br label %common.ret
}Base.mapreduce_impl is a recursive function to do pairwaise summation, so even a small overhead at the beginning of the generated code for the function scales with the size of the input data.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
performanceMust go fasterMust go faster