From b57f12103f74dbcd3fc1900ae8236dbf33c24cc4 Mon Sep 17 00:00:00 2001
From: Fredrik Bagge Carlson <cont-frb@ulund.org>
Date: Thu, 6 Jun 2019 20:34:07 +0200
Subject: [PATCH 1/3] refactor WIP

---
 Manifest.toml                         |  63 +++++-
 Project.toml                          |   2 +
 src/DifferentialDynamicProgramming.jl |   8 +-
 src/backward_pass.jl                  | 293 ++++++++------------------
 src/boxQP.jl                          |  28 ++-
 src/forward_pass.jl                   |  29 +--
 src/iLQG.jl                           |  59 +++---
 src/system_pendcart.jl                |  75 +++----
 test/test_pendcart.jl                 |   2 +-
 test/test_readme.jl                   |  51 ++---
 10 files changed, 285 insertions(+), 325 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 256450d..93aadea 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -3,12 +3,32 @@
 [[Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 
+[[CSTParser]]
+deps = ["LibGit2", "Test", "Tokenize"]
+git-tree-sha1 = "437c93bc191cd55957b3f8dee7794b6131997c56"
+uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f"
+version = "0.5.2"
+
+[[Compat]]
+deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
+git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f"
+uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
+version = "2.1.0"
+
 [[DataStructures]]
 deps = ["InteractiveUtils", "OrderedCollections", "Random", "Serialization", "Test"]
 git-tree-sha1 = "ca971f03e146cf144a9e2f2ce59674f5bf0e8038"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 version = "0.15.0"
 
+[[Dates]]
+deps = ["Printf"]
+uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+
+[[DelimitedFiles]]
+deps = ["Mmap"]
+uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+
 [[Distributed]]
 deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
@@ -17,6 +37,9 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
+[[LibGit2]]
+uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+
 [[Libdl]]
 uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 
@@ -33,15 +56,24 @@ version = "0.2.1"
 [[Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
+[[MacroTools]]
+deps = ["CSTParser", "Compat", "DataStructures", "Test"]
+git-tree-sha1 = "daecd9e452f38297c686eba90dba2a6d5da52162"
+uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
+version = "0.5.0"
+
 [[Markdown]]
 deps = ["Base64"]
 uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 
+[[Mmap]]
+uuid = "a63ad114-7e13-5084-954f-fe012c677804"
+
 [[OrderedCollections]]
 deps = ["Random", "Serialization", "Test"]
-git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
+git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.0.2"
+version = "1.1.0"
 
 [[Parameters]]
 deps = ["Markdown", "OrderedCollections", "REPL", "Test"]
@@ -49,6 +81,10 @@ git-tree-sha1 = "70bdbfb2bceabb15345c0b54be4544813b3444e4"
 uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 version = "0.10.3"
 
+[[Pkg]]
+deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
+uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+
 [[Printf]]
 deps = ["Unicode"]
 uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -73,9 +109,16 @@ git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1"
 uuid = "ae029012-a4dd-5104-9daa-d747884805df"
 version = "0.5.2"
 
+[[SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+
 [[Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 
+[[SharedArrays]]
+deps = ["Distributed", "Mmap", "Random", "Serialization"]
+uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
+
 [[Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 
@@ -83,6 +126,12 @@ uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 deps = ["LinearAlgebra", "Random"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
+[[StaticArrays]]
+deps = ["LinearAlgebra", "Random", "Statistics"]
+git-tree-sha1 = "db23bbf50064c582b6f2b9b043c8e7e98ea8c0c6"
+uuid = "90137ffa-7385-5640-81b9-e52037218182"
+version = "0.11.0"
+
 [[Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -91,6 +140,16 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
 uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
+[[Tokenize]]
+deps = ["Printf", "Test"]
+git-tree-sha1 = "3e83f60b74911d3042d3550884ca2776386a02b8"
+uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624"
+version = "0.5.3"
+
+[[UUIDs]]
+deps = ["Random", "SHA"]
+uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
 [[Unicode]]
 uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
diff --git a/Project.toml b/Project.toml
index 52dda97..afb9140 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,9 +7,11 @@ version = "0.4.0"
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearTimeVaryingModelsBase = "5e7a79f3-ee03-5374-bd42-db9a52b6e959"
+MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7"
diff --git a/src/DifferentialDynamicProgramming.jl b/src/DifferentialDynamicProgramming.jl
index f1d4e3c..ed368ec 100644
--- a/src/DifferentialDynamicProgramming.jl
+++ b/src/DifferentialDynamicProgramming.jl
@@ -1,12 +1,14 @@
 module DifferentialDynamicProgramming
-using LinearTimeVaryingModelsBase, Requires, ValueHistories, LinearAlgebra, Statistics, Printf
+using LinearTimeVaryingModelsBase, Requires, ValueHistories, LinearAlgebra, Statistics, Printf, StaticArrays
+using MacroTools: @capture, postwalk, prewalk
 const DEBUG = false # Set this flag to true in order to print debug messages
 
 
-export QPTrace, boxQP, demoQP, iLQG,iLQGkl, demo_linear, demo_linear_kl, demo_pendcart, GaussianPolicy
-
+export QPTrace, boxQP, demoQP, iLQG,iLQGkl, demo_linear, demo_linear_kl, demo_pendcart, GaussianPolicy, tosvec
 
+tosvec(x) = reinterpret(SVector{size(x,1),eltype(x)}, x)[:]
 eye(n) = Matrix{Float64}(I,n,n)
+Base.strides(::SArray{Tuple{i,j}}) where {i,j} = (1,i)
 
 function __init__()
     @require Plots="91a5bcdd-55d7-5caf-9e0b-520d859cae80" begin
diff --git a/src/backward_pass.jl b/src/backward_pass.jl
index f71001d..607d20c 100644
--- a/src/backward_pass.jl
+++ b/src/backward_pass.jl
@@ -1,32 +1,86 @@
 choleskyvectens(a,b) = permutedims(sum(a.*b,1), [3 2 1])
 
-macro setupQTIC()
-    quote
-        m          = size(u,1)
-        n,_,N        = size(fx)
 
-        @assert size(cx) == (n, N) "size(cx) should be (n, N)"
-        @assert size(cu) == (m, N) "size(cu) should be (m, N)"
-        @assert size(cxx) == (n, n) "size(cxx) should be(n, n) "
-        @assert size(cxu) == (n, m) "size(cxu) should be(n, m) "
+getmatindex(x,i) = x[i]
+getmatindex(x::AbstractArray{<:Number},i) = x # In this case the array is constatnt over time
+getmatindex(x::AbstractVector{<:AbstractArray},i) = x[i] # Time varying array
 
-        k          = zeros(m,N)
-        K          = zeros(m,n,N)
-        Vx         = zeros(n,N)
-        Vxx        = zeros(n,n,N)
-        Quu        = Array{T}(undef,m,m,N)
-        Quui       = Array{T}(undef,m,m,N)
-        dV         = [0., 0.]
+macro matviews(ex)
+    ex = prewalk(ex) do ex
+        @capture(ex, a_[i_] = rhs_) && (return :(__protect__($a,$i) = $rhs))
+        @capture(ex, a_[i_] += rhs_) && (return :(__protect__($a,$i) += $rhs))
+        @capture(ex, a_[i_]) && (return :(getmatindex($a,$i)))
+        ex
+    end
+    ex = prewalk(ex) do ex
+        @capture(ex, __protect__(a_,i_) = rhs_) && (return :($a[$i] = $rhs))
+        @capture(ex, __protect__(a_,i_) += rhs_) && (return :($a[$i] += $rhs))
+        ex
+    end
+    esc(ex)
+end
 
-        Vx[:,N]    = cx[:,N]
-        Vxx[:,:,N] = cxx
-        Quu[:,:,N] = cuu
-        diverge    = 0
-    end |> esc
+@macroexpand @matviews begin
+    dV         = dV + [k_i'Qu; .5*k_i'Quu[i]*k_i]
+    Vx[i]    = Qx + K_i'Quu[i]*k_i + K_i'Qu + Qux'k_i
+    Qu  = cu[i]      + fu[i]'Vx[i+1]
+    Qx  = cx[i]      + fx[i]'Vx[i+1]
+    Qux = cxu[i]'  + fu[i]'Vxx[i+1]*fx[i]
+    fxuVx = vectens(Vx[i+1],fxu[i])
+    Qux   = Qux + fxuVx
+    Quu[i] = cuu[i]   + fu[i]'Vxx[i+1]*fu[i]
+    fuuVx = vectens(Vx[i+1],fuu[i])
+    Quu[i] += fuuVx
 end
 
-macro end_backward_pass()
-    quote
+Base.zeros(::Type{Matrix{T}}, n::Int) where T = [zeros(T,0,0) for _ in 1:n]
+Base.zeros(::Type{Vector{T}}, n::Int) where T = [T[] for _ in 1:n]
+
+function back_pass(cx,cu,cxx,cxu,cuu,fx,fu,fxx,fxu,fuu,λ,regType,lims,x,u) # nonlinear time variant
+    m,N        = length(u[1]),length(u)
+    length(cx) == N || throw(ArgumentError("cx must be the same length as u"))
+    @matviews begin
+        n      = length(cx[1])
+        k      = zeros(typeof(cu[1]), N)
+        K      = zeros(typeof(copy(fu[1]')), N)
+        Vx     = zeros(typeof(cx[1]), N)
+        Vxx    = zeros(typeof(fx[1]), N)
+        Quu    = zeros(typeof(cuu[1]), N)
+        Quui   = zeros(typeof(cuu[1]), N)
+        dV     = @SVector [0., 0.]
+        Vx[N]  = cx[N]
+        Vxx[N] = cxx[N]
+        Quu[N] = cuu[N]
+        k[N]   = 0*cu[N]
+        K[N]   = 0*fu[N]'
+    end
+
+    diverge  = 0
+    for i = N-1:-1:1
+        @matviews begin
+            Qu  = cu[i]      + fu[i]'Vx[i+1]
+            Qx  = cx[i]      + fx[i]'Vx[i+1]
+            Qux = cxu[i]'  + fu[i]'Vxx[i+1]*fx[i]
+            if !(fxu === nothing)
+                fxuVx = vectens(Vx[i+1],fxu[i])
+                Qux   = Qux + fxuVx
+            end
+
+            Quu[i] = cuu[i]   + fu[i]'Vxx[i+1]*fu[i]
+            if !(fuu === nothing)
+                fuuVx = vectens(Vx[i+1],fuu[i])
+                Quu[i] += fuuVx
+            end
+
+            Qxx = cxx[i] + fx[i]'Vxx[i+1]*fx[i]
+            fxx === nothing || (Qxx .+= vectens(Vx[i+1],fxx[i]))
+            Vxx_reg = Vxx[i+1] + (regType == 2 ? λ*I : 0*I)
+            Qux_reg = cxu[i]'  + fu[i]'Vxx_reg*fx[i]
+            fxu === nothing || (Qux_reg .+= fxuVx)
+            QuuF = cuu[i]  + fu[i]'Vxx_reg*fu[i] + (regType == 1 ? λ*I : 0*I)
+            fuu === nothing || (QuuF .+= fuuVx)
+        end
+
         QuF = Qu
         if isempty(lims) || lims[1,1] > lims[1,2]
             # debug("#  no control limits: Cholesky decomposition, check for non-PD")
@@ -35,6 +89,7 @@ macro end_backward_pass()
                 R = cholesky(Hermitian(QuuF))
             catch
                 diverge  = i
+                println("Failed chol")
                 return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx, dV
             end
             # debug("#  find control law")
@@ -42,209 +97,41 @@ macro end_backward_pass()
             K_i = -(R\Qux_reg)
         else
             # debug("#  solve Quadratic Program")
-            lower = lims[:,1]-u[:,i]
-            upper = lims[:,2]-u[:,i]
+            lower = lims[:,1]-u[i]
+            upper = lims[:,2]-u[i]
             local k_i,result,free
             try
-                k_i,result,R,free = boxQP(QuuF,QuF,lower,upper,k[:,min(i+1,N-1)])
+                k_i,result,R,free = boxQP(QuuF,QuF,lower,upper,k[min(i+1,N-1)])
             catch
                 result = 0
             end
             if result < 1
                 diverge  = i
+                println("Failed boxQP")
                 return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx, dV
             end
-            K_i  = zeros(m,n)
+            K_i  = similar(K[i])
             if any(free)
                 Lfree         = -R\(R'\Qux_reg[free,:])
                 K_i[free,:]   = Lfree
             end
         end
         # debug("#  update cost-to-go approximation")
-        dV         = dV + [k_i'Qu; .5*k_i'Quu[:,:,i]*k_i]
-        Vx[:,i]    = Qx + K_i'Quu[:,:,i]*k_i + K_i'Qu + Qux'k_i
-        Vxx[:,:,i] = Qxx + K_i'Quu[:,:,i]*K_i + K_i'Qux + Qux'K_i
-        Vxx[:,:,i] = .5*(Vxx[:,:,i] + Vxx[:,:,i]')
-
-        # debug("# save controls/gains")
-        k[:,i]   = k_i
-        K[:,:,i] = K_i
-
-    end |> esc
-end
-
-function back_pass(cx,cu,cxx::AbstractArray{T,3},cxu,cuu,fx::AbstractArray{T,3},fu,fxx,fxu,fuu,λ,regType,lims,x,u) where T # nonlinear time variant
-
-    m,N        = size(u)
-    n          = size(cx,1)
-    @assert size(cx) == (n, N)
-    @assert size(cu) == (m, N)
-    @assert size(cxx) == (n, n, N)
-    @assert size(cxu) == (n, m, N)
-    @assert size(cuu) == (m, m, N)
-    k          = zeros(m,N)
-    K          = zeros(m,n,N)
-    Vx         = zeros(n,N)
-    Vxx        = zeros(n,n,N)
-    Quu        = Array{T}(undef,m,m,N)
-    Quui       = Array{T}(undef,m,m,N)
-    dV         = [0., 0.]
-    Vx[:,N]    = cx[:,N]
-    Vxx[:,:,N] = cxx[:,:,N]
-    Quu[:,:,N] = cuu[:,:,N]
-
-    diverge  = 0
-    for i = N-1:-1:1
-        Qu  = cu[:,i]      + fu[:,:,i]'Vx[:,i+1]
-        Qx  = cx[:,i]      + fx[:,:,i]'Vx[:,i+1]
-        Qux = cxu[:,:,i]'  + fu[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-        if !isempty(fxu)
-            fxuVx = vectens(Vx[:,i+1],fxu[:,:,:,i])
-            Qux   = Qux + fxuVx
-        end
-
-        Quu[:,:,i] = cuu[:,:,i]   + fu[:,:,i]'Vxx[:,:,i+1]*fu[:,:,i]
-        if !isempty(fuu)
-            fuuVx = vectens(Vx[:,i+1],fuu[:,:,:,i])
-            Quu[:,:,i] .+= fuuVx
-        end
-
-        Qxx = cxx[:,:,i] + fx[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-        isempty(fxx) || (Qxx .+= vectens(Vx[:,i+1],fxx[:,:,:,i]))
-        Vxx_reg = Vxx[:,:,i+1] .+ (regType == 2 ? λ*eye(n) : 0)
-        Qux_reg = cxu[:,:,i]'  + fu[:,:,i]'Vxx_reg*fx[:,:,i]
-        isempty(fxu) || (Qux_reg .+= fxuVx)
-        QuuF = cuu[:,:,i]  + fu[:,:,i]'Vxx_reg*fu[:,:,i] .+ (regType == 1 ? λ*eye(m) : 0)
-        isempty(fuu) || (QuuF .+= fuuVx)
-
-        @end_backward_pass
-    end
-
-    return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx,dV
-end
-
-
-function back_pass(cx,cu,cxx::AbstractArray{T,2},cxu,cuu,fx::AbstractArray{T,3},fu,fxx,fxu,fuu,λ,regType,lims,x,u) where T # quadratic timeinvariant cost, dynamics nonlinear time variant
-
-    @setupQTIC
-
-    for i = N-1:-1:1
-        Qu  = cu[:,i]   + fu[:,:,i]'Vx[:,i+1]
-        Qx  = cx[:,i]   + fx[:,:,i]'Vx[:,i+1]
-        Qux = cxu' + fu[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-        if !isempty(fxu)
-            fxuVx = vectens(Vx[:,i+1],fxu[:,:,:,i])
-            Qux   = Qux + fxuVx
+        @matviews begin
+            dV    += [k_i'Qu; .5*k_i'Quu[i]*k_i]
+            Vx[i]  = Qx + K_i'Quu[i]*k_i + K_i'Qu + Qux'k_i
+            Vxx[i] = Qxx + K_i'Quu[i]*K_i + K_i'Qux + Qux'K_i
+            Vxx[i] = .5*(Vxx[i] + Vxx[i]')
         end
 
-        Quu[:,:,i] = cuu + fu[:,:,i]'Vxx[:,:,i+1]*fu[:,:,i]
-        if !isempty(fuu)
-            fuuVx = vectens(Vx[:,i+1],fuu[:,:,:,i])
-            Quu[:,:,i]   = Quu[:,:,i] + fuuVx
-        end
-        Qxx = cxx  + fx[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-        isempty(fxx) || (Qxx .+= vectens(Vx[:,i+1],fxx[:,:,:,i]))
-        Vxx_reg = Vxx[:,:,i+1] .+ (regType == 2 ? λ*eye(n) : 0)
-        Qux_reg = cxu'   + fu[:,:,i]'Vxx_reg*fx[:,:,i]
-        isempty(fxu) || (Qux_reg .+= fxuVx)
-        QuuF = cuu  + fu[:,:,i]'Vxx_reg*fu[:,:,i] .+ (regType == 1 ? λ*eye(m) : 0)
-        isempty(fuu) || (QuuF .+= fuuVx)
-        @end_backward_pass
-    end
-    return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx,dV
-end
-
-function back_pass(cx,cu,cxx::AbstractArray{T,2},cxu,cuu,fx::AbstractArray{T,3},fu,λ,regType,lims,x,u) where T # quadratic timeinvariant cost, linear time variant dynamics
-    @setupQTIC
-    for i = N-1:-1:1
-        Qu         = cu[:,i] + fu[:,:,i]'Vx[:,i+1]
-        Qx         = cx[:,i] + fx[:,:,i]'Vx[:,i+1]
-        Qux        = cxu' + fu[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-        Quu[:,:,i] = cuu + fu[:,:,i]'Vxx[:,:,i+1]*fu[:,:,i]
-        Qxx        = cxx + fx[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-        Vxx_reg    = Vxx[:,:,i+1] .+ (regType == 2 ? λ*eye(n) : 0)
-        Qux_reg    = cxu' + fu[:,:,i]'Vxx_reg*fx[:,:,i]
-        QuuF       = cuu + fu[:,:,i]'Vxx_reg*fu[:,:,i] .+ (regType == 1 ? λ*eye(m) : 0)
-        @end_backward_pass
-    end
-
-    return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx,dV
-end
-
-function back_pass(cx,cu,cxx::AbstractArray{T,3},cxu,cuu,fx::AbstractArray{T,3},fu,λ,regType,lims,x,u) where T # quadratic timeVariant cost, linear time variant dynamics
-    m          = size(u,1)
-    n,N        = size(fx,1,3)
-
-    @assert size(cx) == (n, N)
-    @assert size(cu) == (m, N)
-    @assert size(cxx) == (n, n, N)
-    @assert size(cxu) == (n, m, N)
-    @assert size(cuu) == (m, m, N)
-
-    k          = zeros(m,N)
-    K          = zeros(m,n,N)
-    Vx         = zeros(n,N)
-    Vxx        = zeros(n,n,N)
-    Quu        = Array{T}(undef,m,m,N)
-    Quui       = Array{T}(undef,m,m,N)
-    dV         = [0., 0.]
-
-    Vx[:,N]    = cx[:,N]
-    Vxx[:,:,N] = cxx[:,:,end]
-    Quu[:,:,N] = cuu[:,:,N]
-
-    diverge    = 0
-    for i = N-1:-1:1
-        Qu          = cu[:,i] + fu[:,:,i]'Vx[:,i+1]
-        Qx          = cx[:,i] + fx[:,:,i]'Vx[:,i+1]
-        Vxx_reg     = Vxx[:,:,i+1] .+ (regType == 2 ? λ*eye(n) : 0)
-        Qux_reg     = cxu[:,:,i]' + fu[:,:,i]'Vxx_reg*fx[:,:,i]
-        QuuF        = cuu[:,:,i]  + fu[:,:,i]'Vxx_reg*fu[:,:,i] .+ (regType == 1 ? λ*eye(m) : 0)
-        Qux         = cxu[:,:,i]' + fu[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-        Quu[:,:,i] .= cuu[:,:,i] .+ fu[:,:,i]'Vxx[:,:,i+1]*fu[:,:,i]
-        Qxx         = cxx[:,:,i]  + fx[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-        @end_backward_pass
+        # debug("# save controls/gains")
+        k[i] = k_i
+        K[i] = K_i
     end
 
     return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx,dV
 end
 
-function back_pass(cx,cu,cxx::AbstractArray{T,2},cxu,cuu,fx::AbstractMatrix{T},fu,λ,regType,lims,x,u) where T # cost quadratic and cost and LTI dynamics
-
-    m,N = size(u)
-    n   = size(fx,1)
-    @assert size(cx) == (n, N)
-    @assert size(cu) == (m, N)
-    @assert size(cxx) == (n, n)
-    @assert size(cxu) == (n, m)
-    @assert size(cuu) == (m, m)
-    k   = zeros(m,N)
-    K   = zeros(m,n,N)
-    Vx  = zeros(n,N)
-    Vxx = zeros(n,n,N)
-    Quu = Array{T}(undef,m,m,N)
-    Quui = Array{T}(undef,m,m,N)
-    dV  = [0., 0.]
-
-    Vx[:,N]    = cx[:,N]
-    Vxx[:,:,N] = cxx
-    Quu[:,:,N] = cuu
-
-    diverge    = 0
-    for i = N-1:-1:1
-        Qu         = cu[:,i] + fu'Vx[:,i+1]
-        Qx         = cx[:,i] + fx'Vx[:,i+1]
-        Qux        = cxu' + fu'Vxx[:,:,i+1]*fx
-        Quu[:,:,i] = cuu + fu'Vxx[:,:,i+1]*fu
-        Qxx        = cxx + fx'Vxx[:,:,i+1]*fx
-        Vxx_reg    = Vxx[:,:,i+1] .+ (regType == 2 ? λ*eye(n) : 0)
-        Qux_reg    = cxu' + fu'Vxx_reg*fx
-        QuuF       = cuu + fu'Vxx_reg*fu .+ (regType == 1 ? λ*eye(m) : 0)
-        @end_backward_pass
-    end
-
-    return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx,dV
-end
 
 
 function graphics(x...)
diff --git a/src/boxQP.jl b/src/boxQP.jl
index 07c9f59..285b334 100644
--- a/src/boxQP.jl
+++ b/src/boxQP.jl
@@ -70,9 +70,8 @@ function boxQP(H,g,lower,upper,x0::AbstractVector;
     iter = 1
     while iter <= maxIter
 
-        if result != 0
-            break
-        end
+        result == 0 || break
+
 
         # debug("# check relative improvement")
         if iter>1 && (oldvalue - value) < minRelImprove*abs(oldvalue)
@@ -90,7 +89,7 @@ function boxQP(H,g,lower,upper,x0::AbstractVector;
         #         clamped[(x[:,1] .== lower)&(grad[:,1].>0)]   = true
         #         clamped[(x[:,1] .== upper)&(grad[:,1].<0)]   = true
         for i = 1:n
-            clamped[i]   = ((x[i,1] == lower[i])&&(grad[i,1]>0)) || ((x[i,1] == upper[i])&&(grad[i,1]<0))
+            clamped[i]   = ((x[i] == lower[i]) && (grad[i]>0)) || ((x[i] == upper[i]) && (grad[i]<0))
         end
         free = .!clamped
 
@@ -104,7 +103,7 @@ function boxQP(H,g,lower,upper,x0::AbstractVector;
         if iter == 1
             factorize = true
         else
-            factorize = any(old_clamped != clamped)
+            factorize = any(old_clamped .!= clamped)
         end
 
         if factorize
@@ -125,7 +124,7 @@ function boxQP(H,g,lower,upper,x0::AbstractVector;
 
         # debug("# get search direction")
         grad_clamped   = g  + H*(x.*clamped)
-        search         = zeros(n,1)
+        search         = zeros(n)
         search[free]   = -Hfree\(Hfree'\grad_clamped[free]) - x[free]
 
         # debug("# check for descent direction")
@@ -138,13 +137,13 @@ function boxQP(H,g,lower,upper,x0::AbstractVector;
         step  = 1
         nstep = 0
         xc    = clamp.(x+step*search,lower,upper)
-        vc    = (xc'*g + 0.5*xc'*H*xc)[1]
+        vc    = (xc'g + 0.5*xc'H*xc)[1]
         while (vc - oldvalue)/(step*sdotg) < Armijo
             step  = step*stepDec
             nstep += 1
             xc    = clamp.(x+step*search,lower,upper)
-            vc    = (xc'*g + 0.5*xc'*H*xc)[1]
-            if step<minStep
+            vc    = (xc'g + 0.5*xc'H*xc)[1]
+            if step < minStep
                 result = 2
                 break
             end
@@ -197,3 +196,14 @@ function demoQP(;kwargs...)
     @time boxQP(H, g, lower, upper, randn(n);print=1, kwargs...)
 
 end
+
+function demoQPstatic(;kwargs...)
+    n 		= 2
+    g 		= SVector{n,Float64}(randn(n))
+    H 		= SMatrix{n,n,Float64}(randn(n,n))
+    H 		= H*H'
+    lower 	= -ones(n)
+    upper 	=  ones(n)
+    @time boxQP(H, g, lower, upper, SVector{n,Float64}(randn(n));print=1, kwargs...)
+
+end
diff --git a/src/forward_pass.jl b/src/forward_pass.jl
index 82d82ee..80b8ca8 100644
--- a/src/forward_pass.jl
+++ b/src/forward_pass.jl
@@ -8,23 +8,24 @@
 """
 function forward_pass(traj_new, x0,u,x,α,f,costfun,lims,diff)
     n         = size(x0,1)
-    m,N       = size(u)
-    xnew      = Array{eltype(x0)}(undef,n,N)
-    xnew[:,1] = x0
+    N         = length(u)
+    m         = length(u[1])
+    xnew      = eltype(u) <: StaticVector ? [@SVector zeros(n) for _ in 1:N] : [Float64[] for _ in 1:N]
+    xnew[1]   = x0
     unew      = copy(u)
     cnew      = zeros(N)
     for i = 1:N
         if !isempty(traj_new)
-            unew[:,i] .+= traj_new.k[:,i]*α
-            dx = diff(xnew[:,i], x[:,i])
-            unew[:,i] .+= traj_new.K[:,:,i]*dx
+            unew[i] += traj_new.k[i]*α
+            dx = diff(xnew[i], x[i])
+            unew[i] += traj_new.K[i]*dx
         end
         if !isempty(lims)
-            unew[:,i] = clamp.(unew[:,i],lims[:,1], lims[:,2])
+            unew[i] = clamp.(unew[i],lims[:,1], lims[:,2])
         end
-        xnewi  = f(xnew[:,i], unew[:,i], i)
+        xnewi  = f(xnew[i], unew[i], i)
         if i < N
-            xnew[:,i+1] = xnewi
+            xnew[i+1] = xnewi
         end
     end
     cnew = costfun(xnew, unew)
@@ -36,9 +37,9 @@ end
 
 function forward_covariance(model, x, u, traj)
     fx,fu,fxx,fxu,fuu = df(model, x, u)
-    n        = size(fx,1)
-    m        = size(fu,2)
-    N        = size(fx,3)
+    n        = size(fx[1],1)
+    m        = size(fu[1],2)
+    N        = length(fx)
     R1       = covariance(model,x,u) # Simple empirical prediction covariance
     Σ0       = R1 # TODO: I was lazy here
     ix = 1:n
@@ -46,8 +47,8 @@ function forward_covariance(model, x, u, traj)
     sigmanew = Array{Float64}(undef,n+m,n+m,N)
     sigmanew[ix,ix,1] = Σ0
     for i = 1:N-1
-        K,Σ = traj.K[:,:,i], traj.Σ[:,:,i]
-        sigmanew[ix,ix,i+1] = fx[:,:,i]*sigmanew[ix,ix,i]*fx[:,:,i]' + R1 # Iterate dLyap forward
+        K,Σ = traj.K[i], traj.Σ[:,:,i]
+        sigmanew[ix,ix,i+1] = fx[i]*sigmanew[ix,ix,i]*fx[i]' + R1 # Iterate dLyap forward
         sigmanew[iu,ix,i] = K*sigmanew[ix,ix,i]
         sigmanew[ix,iu,i] = sigmanew[ix,ix,i]*K'
         sigmanew[iu,iu,i] = K*sigmanew[ix,ix,i]*K' + Σ
diff --git a/src/iLQG.jl b/src/iLQG.jl
index f7388a5..dc9bb97 100644
--- a/src/iLQG.jl
+++ b/src/iLQG.jl
@@ -1,8 +1,8 @@
 import Base: length
-EmptyMat3 = Array{Float64}(undef, 0,0,0)
-EmptyMat2 = Array{Float64}(undef, 0,0)
-emptyMat3(P) = Array{P}(undef, 0,0,0)
-emptyMat2(P) = Array{P}(undef, 0,0)
+EmptyMat3 = [Array{Float64}(undef, 0,0)]
+EmptyMat2 = [Array{Float64}(undef, 0)]
+emptyMat3(P) = [Array{P}(undef, 0,0)]
+emptyMat2(P) = [Array{P}(undef, 0)]
 mutable struct Trace
     iter::Int64
     λ::Float64
@@ -30,25 +30,27 @@ end
 T::Int          # number of time steps
 n::Int          # State dimension
 m::Int          # Number of control inputs
-K::Array{P,3}   # Time-varying feedback gain ∈ R(n,m,T)
-k::Array{P,2}   # Open loop control signal  ∈ R(m,T)
-Σ::Array{P,3}   # Time-varying controller covariance  ∈ R(m,m,T)
-Σi::Array{P,3}  # The inverses of Σ
+K::Vector{<:AbstractMatrix}   # Time-varying feedback gain ∈ R(n,m;T)
+k::Vector{<:AbstractVector}   # Open loop control signal  ∈ R(m;T)
+Σ::Vector{<:AbstractMatrix}   # Time-varying controller covariance  ∈ R(m,m;T)
+Σi::Vector{<:AbstractMatrix}  # The inverses of Σ
 ```
 """
-mutable struct GaussianPolicy{P}
+mutable struct GaussianPolicy{MT1<:AbstractMatrix,MT2<:AbstractMatrix,VT<:AbstractVector}
     T::Int
     n::Int
     m::Int
-    K::Array{P,3}
-    k::Array{P,2}
-    Σ::Array{P,3}
-    Σi::Array{P,3}
+    K::Vector{MT1}
+    k::Vector{VT}
+    Σ::Vector{MT2}
+    Σi::Vector{MT2}
 end
 
 eye(P,n) = Matrix{P}(I,n,n)
 GaussianPolicy(P) = GaussianPolicy(0,0,0,emptyMat3(P),emptyMat2(P),emptyMat3(P),emptyMat3(P))
-GaussianPolicy(P,T,n,m) = GaussianPolicy(T,n,m,zeros(P,m,n,T),zeros(P,m,T),cat([eye(P,m) for t=1:T]..., dims=3),cat([eye(P,m) for t=1:T]..., dims=3))
+# GaussianPolicy(P,T,n,m) = GaussianPolicy(T,n,m,zeros(P,m,n,T),zeros(P,m,T),cat([eye(P,m) for t=1:T]..., dims=3),cat([eye(P,m) for t=1:T]..., dims=3))
+
+GaussianPolicy(P,T,n,m) = GaussianPolicy(0,0,0,zeros(SMatrix{m,n,P},T),zeros(SVector{m,P},T),zeros(SMatrix{m,m,P},T),zeros(SMatrix{m,m,P},T))
 Base.isempty(gp::GaussianPolicy) = gp.T == gp.n == gp.m == 0
 Base.length(gp::GaussianPolicy) = gp.T
 
@@ -162,10 +164,10 @@ function iLQG(f,costfun,df, x0, u0;
     local fx,fu,fxx,fxu,fuu,cx,cu,cxx,cxu,cuu,xnew,unew,costnew,g_norm,Vx,Vxx,dV,αi
     # --- initial sizes and controls
     n   = size(x0, 1)          # dimension of state vector
-    m   = size(u0, 1)          # dimension of control vector
-    N   = size(u0, 2)          # number of state transitions
+    m   = size(u0[1], 1)          # dimension of control vector
+    N   = length(u0)          # number of state transitions
     u   = u0                   # initial control sequence
-    traj_new  = GaussianPolicy(Float64)
+    traj_new  = GaussianPolicy(Float64,N,n,m)
     # traj_prev = GaussianDist(Float64)
 
     # --- initialize trace data structure
@@ -175,19 +177,20 @@ function iLQG(f,costfun,df, x0, u0;
 
     # --- initial trajectory
     debug("Setting up initial trajectory")
-    if size(x0,2) == 1 # only initial state provided
+    if x0 isa AbstractVector # only initial state provided
         diverge = true
         for outer αi ∈ α
             debug("# test different backtracing parameters α and break loop when first succeeds")
-            x,un,cost, = forward_pass(traj_new,x0[:,1],αi*u,[],1,f,costfun, lims,diff_fun)
+            x,un,cost, = forward_pass(traj_new,x0,αi.*u,[],1,f,costfun, lims,diff_fun)
             debug("# simplistic divergence test")
-            if all(abs.(x) .< 1e8)
+            @show typeof(x)
+            if all(all(abs.(x) .< 1e8) for x in x)
                 u = un
                 diverge = false
                 break
             end
         end
-    elseif size(x0,2) == N
+    elseif length(x0) == N
         debug("# pre-rolled initial forward pass, initial traj provided")
         x        = x0
         diverge  = false
@@ -226,16 +229,12 @@ function iLQG(f,costfun,df, x0, u0;
             flg_change   = false
         end
         # Determine what kind of system we are dealing with
-        linearsys = isempty(fxx) && isempty(fxu) && isempty(fuu); debug("linear system: $linearsys")
+        linearsys = fxx === nothing && fxu === nothing && fuu === nothing; debug("linear system: $linearsys")
 
         # ====== STEP 2: backward pass, compute optimal control law and cost-to-go
         back_pass_done = false
         while !back_pass_done
-            _t = @elapsed diverge, traj_new,Vx, Vxx,dV = if linearsys
-                back_pass(cx,cu,cxx,cxu,cuu,fx,fu,λ, regType, lims,x,u)
-            else
-                back_pass(cx,cu,cxx,cxu,cuu,fx,fu,fxx,fxu,fuu,λ, regType, lims,x,u)
-            end
+            _t = @elapsed diverge, traj_new,Vx, Vxx,dV = back_pass(cx,cu,cxx,cxu,cuu,fx,fu,fxx,fxu,fuu,λ, regType, lims,x,u)
             increment!(trace, :time_backward, iter, _t)
             iter == 1 && (traj_prev = traj_new) # TODO: set k μu to zero fir traj_prev
 
@@ -251,7 +250,9 @@ function iLQG(f,costfun,df, x0, u0;
 
         k, K = traj_new.k, traj_new.K
         #  check for termination due to small gradient
-        g_norm = mean(maximum(abs.(k) ./ (abs.(u) .+ 1), dims=1))
+        g_norm = mean(zip(k,u)) do (k,u)
+            maximum(abs.(k) ./ (abs.(u) .+ 1))
+        end
         trace(:grad_norm, iter, g_norm)
         if g_norm <  tol_grad && λ < 1e-5 && satisfied
             verbosity > 0 && @printf("\nSUCCESS: gradient norm < tol_grad\n")
@@ -263,7 +264,7 @@ function iLQG(f,costfun,df, x0, u0;
         if back_pass_done
             debug("#  serial backtracking line-search")
             @elapsed(for outer αi = α
-                xnew,unew,costnew = forward_pass(traj_new, x0[:,1] ,u, x,αi,f,costfun, lims, diff_fun)
+                xnew,unew,costnew = forward_pass(traj_new, x0 ,u, x,αi,f,costfun, lims, diff_fun)
                 Δcost    = sum(cost) - sum(costnew)
                 expected_reduction = -αi*(dV[1] + αi*dV[2])
                 reduce_ratio = if expected_reduction > 0
diff --git a/src/system_pendcart.jl b/src/system_pendcart.jl
index 4ae4d49..8298d63 100644
--- a/src/system_pendcart.jl
+++ b/src/system_pendcart.jl
@@ -90,94 +90,89 @@ function demo_pendcart(;x0 = [π-0.6,0,0,0], goal = [π,0,0,0],
 
 
     function cost_quadratic(x,u)
-        local d = (x.-goal)
+        local d = (x .- Ref(goal))
         0.5(d'*Q*d + u'R*u)[1]
     end
 
     function cost_quadratic(x::Matrix,u)
         local d = (x.-goal)
-        T = size(u,2)
+        T = length(u)
         c = Vector{Float64}(undef,T+1)
         for t = 1:T
-            c[t] = 0.5(d[:,t]'*Q*d[:,t] + u[:,t]'R*u[:,t])[1]
+            c[t] = 0.5(d[t]'*Q*d[t] + u[t]'R*u[t])[1]
         end
-        c[end] = cost_quadratic(x[:,end][:],[0.0])
+        c[end] = cost_quadratic(x[end],[0.0])
         return c
     end
 
-    cx = zeros(4,T)
-    cu = zeros(1,T)
-    cxu = zeros(D,1)
 
     function dcost_quadratic(x,u)
-        cx  .= Q*(x.-goal)
-        cu  .= R.*u
-        return cx,cu,cxu
+        cx  = Ref(Q) .* (x .- Ref(goal))
+        cu  = Ref(R) .*u
+        return cx,cu,zeros(D)
     end
 
 
     function lin_dyn_f(x,u,i)
-        u[isnan.(u)] .= 0
         f = dfsys(x,u)
     end
 
 
-    fxc           = Array{Float64}(undef,D,D,I)
-    fuc           = Array{Float64}(undef,D,1,I)
-    fxd           = Array{Float64}(undef,D,D,I)
-    fud           = Array{Float64}(undef,D,1,I)
-    for ii = 1:I
-        fxc[:,:,ii] = [0 1 0 0;
+
+        fxc = Float64[0 1 0 0;
         0 0 0 0;
         0 0 0 1;
         0 0 0 0]
-        fuc[:,:,ii] = [0, 0, 0, 1]
-    end
+        fuc = reshape(Float64[0, 0, 0, 1], 4,1)
+        fxc = [copy(fxc) for i = 1:N]
+        fuc = [copy(fuc) for i = 1:N]
+        fxd = deepcopy(fxc)
+        fud = deepcopy(fuc)
+
 
     function lin_dyn_df(x,u)
-        u[isnan.(u)] .= 0
-        D            = size(x,1)
-        nu,I         = size(u)
+        D            = length(x[1])
+        nu,I         = length(u[1]),length(u)
         cx,cu,cxu    = dcost_quadratic(x,u)
         cxx          = Q
-        cuu          = [R]
+        cuu          = fill(R,1,1)
         for ii = 1:I
-            fxc[2,1,ii] = -g/l*cos(x[1,ii])-u[ii]/l*sin(x[1,ii])
-            fxc[2,2,ii] = -d
-            fuc[2,1,ii] = cos(x[1,ii])/l
-            ABd = exp([fxc[:,:,ii]*h  fuc[:,:,ii]*h; zeros(nu, D + nu)])# ZoH sampling
-            fxd[:,:,ii] = ABd[1:D,1:D]
-            fud[:,:,ii] = ABd[1:D,D+1:D+nu]
+            fxc[ii][2,1] = -g/l*cos(x[ii][1])-u[ii][]/l*sin(x[ii][1])
+            fxc[ii][2,2] = -d
+            fuc[ii][2,1] = cos(x[ii][1])/l
+            ABd = exp([fxc[ii]*h  fuc[ii]*h; zeros(nu, D + nu)])# ZoH sampling
+            fxd[ii] = ABd[1:D,1:D]
+            fud[ii] = ABd[1:D,D+1:D+nu]
         end
-        fxx=fxu=fuu = []
+        fxx=fxu=fuu = nothing
         return fxd,fud,fxx,fxu,fuu,cx,cu,cxx,cxu,cuu
     end
 
-    x = zeros(4,N)
-    u = zeros(1,T)
+    x = [zeros(4) for _ in 1:N]
+    u = [zeros(1) for _ in 1:T]
 
     """
     Simulate a pendulum on a cart using the non-linear equations
     """
     function simulate_pendcart(x0,L, dfsys, cost)
-        x[:,1] = x0
-        u[1] = 0
+        x[1] = x0
+        u[1] .= 0
         for t = 2:T
-            dx     = copy(x[:,t-1])
+            dx     = copy(x[t-1])
             dx[1] -= pi
-            u[t]   = -(L*dx)[1]
+            u[t]   .= -(L*dx)
             if !isempty(lims)
-                u[t]   = clamp(u[t],lims[1],lims[2])
+                clamp!(u[t],lims[1],lims[2])
             end
-            x[:,t] = dfsys(x[:,t-1],u[t])
+            x[t] = dfsys(x[t-1],u[t])
         end
-        dx      = copy(x[:,T])
+        dx      = copy(x[T])
         dx[1]  -= pi
         uT      = -(L*dx)[1]
         if !isempty(lims)
             uT   = clamp(uT,lims[1],lims[2])
         end
-        x[:,T+1] = dfsys(x[:,T],uT)
+        x[T+1] = dfsys(x[T],uT)
         c = cost(x,u)
 
         return x, u, c
diff --git a/test/test_pendcart.jl b/test/test_pendcart.jl
index 4c02ebb..4b986af 100644
--- a/test/test_pendcart.jl
+++ b/test/test_pendcart.jl
@@ -123,7 +123,7 @@ g/l 0 0 0;
 0 0 0 1;
 0 0 0 0]
 B   = [0, -1/l, 0, 1]
-C   = eye(4) # Assume all states are measurable
+C   = Matrix{Float64}(I,4,4) # Assume all states are measurable
 D   = 4
 
 sys = ss(A,B,C,zeros(4))
diff --git a/test/test_readme.jl b/test/test_readme.jl
index 439c3bb..576c58b 100644
--- a/test/test_readme.jl
+++ b/test/test_readme.jl
@@ -1,52 +1,50 @@
-using Test, Statistics, LinearAlgebra, Random
+using Test, Statistics, LinearAlgebra, Random, DifferentialDynamicProgramming
 # make stable linear dynamics
 Random.seed!(0)
 eye = DifferentialDynamicProgramming.eye
-costs = map(1:10) do MCiteration
-    h    = .01  # time step
-    n    = 10   # state dimension
-    m    = 2    # control dimension
-    A    = randn(n,n)
+h    = .01  # time step
+n    = 10   # state dimension
+m    = 2    # control dimension
+# control limits
+lims = []# ones(m,1)*[-1 1]*.6
+
+T    = 1000             # horizon
+# quadratic costs
+const Q    = SMatrix{n,n,Float64}(h*eye(n))
+const R    = SMatrix{m,m,Float64}(.1*h*eye(m))
+
+@time costs = map(1:10) do MCiteration
+    A    = @SMatrix randn(n,n)
     A    = A-A' # skew-symmetric = pure imaginary eigenvalues
     A    = exp(h*A)        # discrete time
-    B    = h*randn(n,m)
-
-    # quadratic costs
-    Q    = h*eye(n)
-    R    = .1*h*eye(m)
+    B    = h* @SMatrix(randn(n,m))
 
-    # control limits
-    lims = []# ones(m,1)*[-1 1]*.6
 
-    T    = 1000             # horizon
-    x0   = ones(n,1)        # initial state
-    u0   = .1*randn(m,T)    # initial controls
+    x0   = ones(n)        # initial state
+    u0   = .1*randn(m,T) |> tosvec    # initial controls
 
     # optimization problem
     N    = T+1
     fx   = A
     fu   = B
     cxx  = Q
-    cxu  = zeros(size(B))
+    cxu  = 0*B
     cuu  = R
 
     # Specify dynamics functions
     function lin_dyn_df(x,u,Q,R)
-        u[isnan.(u)] .= 0
-        cx  = Q*x
-        cu  = R*u
-        fxx=fxu=fuu = []
+        cx  = Ref(Q) .* x
+        cu  = Ref(R) .* u
+        fxx=fxu=fuu = nothing
         return fx,fu,fxx,fxu,fuu,cx,cu,cxx,cxu,cuu
     end
     function lin_dyn_f(x,u,A,B)
-        u[isnan.(u)] .= 0
         xnew = A*x + B*u
         return xnew
     end
 
     function lin_dyn_cost(x,u,Q)
-        c = 0.5*sum(x.*(Q*x)) + 0.5*sum(u.*(R*u))
-        return c
+        0.5*sum(x'Q*x + u'R*u for (x,u) in zip(x,u))
     end
 
     f(x,u,i)     = lin_dyn_f(x,u,A,B)
@@ -68,3 +66,8 @@ end
 @test maximum(costs) < 25 # This should be the case most of the times
 @test mean(costs) < 10 # This should be the case most of the times
 @test minimum(costs) < 5 # This should be the case most of the times
+
+
+
+# 8.136281 seconds (13.75 M allocations: 2.661 GiB, 13.57% gc time)
+# 2.174562 seconds (2.90 M allocations: 157.990 MiB, 5.06% gc time)

From 568ddb4f6829195f426e23127dbe7a81fa77e43d Mon Sep 17 00:00:00 2001
From: Fredrik Bagge Carlson <cont-frb@ulund.org>
Date: Thu, 6 Jun 2019 21:31:38 +0200
Subject: [PATCH 2/3] more WIP

---
 src/DifferentialDynamicProgramming.jl | 10 +++---
 src/backward_pass.jl                  | 41 ++++++++-------------
 src/iLQG.jl                           |  1 -
 src/system_pendcart.jl                | 52 +++++++++++++--------------
 4 files changed, 46 insertions(+), 58 deletions(-)

diff --git a/src/DifferentialDynamicProgramming.jl b/src/DifferentialDynamicProgramming.jl
index ed368ec..1a193ad 100644
--- a/src/DifferentialDynamicProgramming.jl
+++ b/src/DifferentialDynamicProgramming.jl
@@ -15,19 +15,19 @@ function __init__()
     @eval LinearAlgebra.adjoint(x::String) = x
     @eval function plotstuff_linear(x,u,cost,totalcost)
         p = Plots.plot(layout=(2,2))
-        Plots.plot!(p,x', title="State Trajectories", xlabel="Time step",legend=false, subplot=1, show=false)
+        Plots.plot!(p,reduce(hcat,x)', title="State Trajectories", xlabel="Time step",legend=false, subplot=1, show=false)
         Plots.plot!(p,cost,c=:black,linewidth=3, title="Cost", xlabel="Time step", subplot=2, show=false)
-        Plots.plot!(p,u',title="Control signals", xlabel="Time step", subplot=3, show=false)
+        Plots.plot!(p,reduce(hcat,u)',title="Control signals", xlabel="Time step", subplot=3, show=false)
         Plots.plot!(p,totalcost,title="Total cost", xlabel="Iteration", subplot=4, show=false)
         Plots.gui()
     end
     @eval function plotstuff_pendcart(x00, u00, x,u,cost00,cost,otrace)
         cp = Plots.plot(layout=(1,3))
-        sp = Plots.plot(x00',title=["\$x_$(i)\$" for i=1:size(x00,1)]', lab="Simulation", layout=(2,2))
-        Plots.plot!(sp,x', title=["\$x_$(i)\$" for i=1:size(x00,1)]', lab="Optimized", xlabel="Time step", legend=true)
+        sp = Plots.plot(reduce(hcat,x00)',title=["\$x_$(i)\$" for i=1:length(x00[1])]', lab="Simulation", layout=(2,2))
+        Plots.plot!(sp,reduce(hcat,x)', title=["\$x_$(i)\$" for i=1:length(x00[1])]', lab="Optimized", xlabel="Time step", legend=true)
 
         Plots.plot!(cp,cost00, title="Cost", lab="Simulation", subplot=2)
-        Plots.plot!(cp,u', legend=true, title="Control signal",lab="Optimized", subplot=1)
+        Plots.plot!(cp,reduce(hcat,u)', legend=true, title="Control signal",lab="Optimized", subplot=1)
         Plots.plot!(cp,cost[2:end], legend=true, title="Cost",lab="Optimized", xlabel="Time step", subplot=2, yscale=:log10)
         iters = sum(cost .> 0)
         filter!(x->x>0,cost)
diff --git a/src/backward_pass.jl b/src/backward_pass.jl
index 607d20c..4ee5955 100644
--- a/src/backward_pass.jl
+++ b/src/backward_pass.jl
@@ -20,33 +20,21 @@ macro matviews(ex)
     esc(ex)
 end
 
-@macroexpand @matviews begin
-    dV         = dV + [k_i'Qu; .5*k_i'Quu[i]*k_i]
-    Vx[i]    = Qx + K_i'Quu[i]*k_i + K_i'Qu + Qux'k_i
-    Qu  = cu[i]      + fu[i]'Vx[i+1]
-    Qx  = cx[i]      + fx[i]'Vx[i+1]
-    Qux = cxu[i]'  + fu[i]'Vxx[i+1]*fx[i]
-    fxuVx = vectens(Vx[i+1],fxu[i])
-    Qux   = Qux + fxuVx
-    Quu[i] = cuu[i]   + fu[i]'Vxx[i+1]*fu[i]
-    fuuVx = vectens(Vx[i+1],fuu[i])
-    Quu[i] += fuuVx
-end
 
-Base.zeros(::Type{Matrix{T}}, n::Int) where T = [zeros(T,0,0) for _ in 1:n]
-Base.zeros(::Type{Vector{T}}, n::Int) where T = [T[] for _ in 1:n]
+Base.zeros(x::AbstractVecOrMat{T}, n::Int) where T = [zeros(T,size(x)) for _ in 1:n]
+Base.zeros(x::StaticArray, n::Int) = zeros(typeof(x), n)
 
 function back_pass(cx,cu,cxx,cxu,cuu,fx,fu,fxx,fxu,fuu,λ,regType,lims,x,u) # nonlinear time variant
     m,N        = length(u[1]),length(u)
     length(cx) == N || throw(ArgumentError("cx must be the same length as u"))
     @matviews begin
         n      = length(cx[1])
-        k      = zeros(typeof(cu[1]), N)
-        K      = zeros(typeof(copy(fu[1]')), N)
-        Vx     = zeros(typeof(cx[1]), N)
-        Vxx    = zeros(typeof(fx[1]), N)
-        Quu    = zeros(typeof(cuu[1]), N)
-        Quui   = zeros(typeof(cuu[1]), N)
+        k      = zeros(cu[1], N)
+        K      = zeros(copy(fu[1]'), N)
+        Vx     = zeros(cx[1], N)
+        Vxx    = zeros(fx[1], N)
+        Quu    = zeros(cuu[1], N)
+        Quui   = zeros(cuu[1], N)
         dV     = @SVector [0., 0.]
         Vx[N]  = cx[N]
         Vxx[N] = cxx[N]
@@ -100,17 +88,18 @@ function back_pass(cx,cu,cxx,cxu,cuu,fx,fu,fxx,fxu,fuu,λ,regType,lims,x,u) # no
             lower = lims[:,1]-u[i]
             upper = lims[:,2]-u[i]
             local k_i,result,free
-            try
-                k_i,result,R,free = boxQP(QuuF,QuF,lower,upper,k[min(i+1,N-1)])
-            catch
-                result = 0
-            end
+            # try
+                k_i,result,R,free = boxQP(QuuF,QuF,lower,upper,k[i+1])
+            # catch ex
+            #     @show ex
+            #     result = 0
+            # end
             if result < 1
                 diverge  = i
                 println("Failed boxQP")
                 return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx, dV
             end
-            K_i  = similar(K[i])
+            K_i  = 0*K[i]
             if any(free)
                 Lfree         = -R\(R'\Qux_reg[free,:])
                 K_i[free,:]   = Lfree
diff --git a/src/iLQG.jl b/src/iLQG.jl
index dc9bb97..ba1a930 100644
--- a/src/iLQG.jl
+++ b/src/iLQG.jl
@@ -183,7 +183,6 @@ function iLQG(f,costfun,df, x0, u0;
             debug("# test different backtracing parameters α and break loop when first succeeds")
             x,un,cost, = forward_pass(traj_new,x0,αi.*u,[],1,f,costfun, lims,diff_fun)
             debug("# simplistic divergence test")
-            @show typeof(x)
             if all(all(abs.(x) .< 1e8) for x in x)
                 u = un
                 diverge = false
diff --git a/src/system_pendcart.jl b/src/system_pendcart.jl
index 8298d63..f471f53 100644
--- a/src/system_pendcart.jl
+++ b/src/system_pendcart.jl
@@ -40,27 +40,27 @@ Run the iLQG Function to find an optimal trajectory For the "pendulum on a cart
 `doplot = true`                 : Plot results
 """
 function demo_pendcart(;x0 = [π-0.6,0,0,0], goal = [π,0,0,0],
-    Q      = Diagonal([10.,1,2,1]), # State weight matrix
-    R      = 1.,                    # Control weight matrix
-    lims   = 5.0*[-1 1],            # control limits,
-    T      = 600,                   # Number of time steps
-    doplot = true                   # Plot results
-    )
+                        Q      = Diagonal([10.,1,2,1]), # State weight matrix
+                        R      = 1.,                    # Control weight matrix
+                        lims   = 5.0*[-1 1],            # control limits,
+                        T      = 600,                   # Number of time steps
+                        doplot = true                   # Plot results
+                        )
 
     N = T+1
     g = 9.82
     l = 0.35 # Length of pendulum
     h = 0.01 # Sample time
     d = 0.99
-    A = [0 1 0 0; # Linearlized system dynamics matrix, continuous time
+    A = @SMatrix [0 1 0 0; # Linearlized system dynamics matrix, continuous time
     g/l -d 0 0;
     0 0 0 1;
     0 0 0 0]
-    B = [0, -1/l, 0, 1]
+    B = @SMatrix [0; -1/l; 0; 1]
     C = eye(4) # Assume all states are measurable
     D = 4
     L = lqr(A,B,Q,R) # Calculate the optimal state feedback
-    I = T
+
 
     function fsys_closedloop(t,x,L,xd)
         dx = copy(x)
@@ -90,12 +90,12 @@ function demo_pendcart(;x0 = [π-0.6,0,0,0], goal = [π,0,0,0],
 
 
     function cost_quadratic(x,u)
-        local d = (x .- Ref(goal))
+        local d = (x - goal)
         0.5(d'*Q*d + u'R*u)[1]
     end
 
-    function cost_quadratic(x::Matrix,u)
-        local d = (x.-goal)
+    function cost_quadratic(x::Vector{<:AbstractVector},u)
+        local d = (x .- Ref(goal))
         T = length(u)
         c = Vector{Float64}(undef,T+1)
         for t = 1:T
@@ -109,7 +109,7 @@ function demo_pendcart(;x0 = [π-0.6,0,0,0], goal = [π,0,0,0],
     function dcost_quadratic(x,u)
         cx  = Ref(Q) .* (x .- Ref(goal))
         cu  = Ref(R) .*u
-        return cx,cu,zeros(D)
+        return cx,cu,zeros(D,1)
     end
 
 
@@ -119,15 +119,15 @@ function demo_pendcart(;x0 = [π-0.6,0,0,0], goal = [π,0,0,0],
 
 
 
-        fxc = Float64[0 1 0 0;
-        0 0 0 0;
-        0 0 0 1;
-        0 0 0 0]
-        fuc = reshape(Float64[0, 0, 0, 1], 4,1)
-        fxc = [copy(fxc) for i = 1:N]
-        fuc = [copy(fuc) for i = 1:N]
-        fxd = deepcopy(fxc)
-        fud = deepcopy(fuc)
+    fxc = Float64[0 1 0 0;
+                0 0 0 0;
+                0 0 0 1;
+                0 0 0 0]
+    fuc = reshape(Float64[0, 0, 0, 1], 4,1)
+    fxc = [copy(fxc) for i = 1:N]
+    fuc = [copy(fuc) for i = 1:N]
+    fxd = deepcopy(fxc)
+    fud = deepcopy(fuc)
 
 
     function lin_dyn_df(x,u)
@@ -180,24 +180,24 @@ function demo_pendcart(;x0 = [π-0.6,0,0,0], goal = [π,0,0,0],
 
 
     # Simulate the closed loop system with regular LQG control and watch it fail due to control limits
+    # x0 = [π-0.6,0,0,0]; goal = [π,0,0,0]
     x00, u00, cost00 = simulate_pendcart(x0, L, dfsys, cost_quadratic)
 
-
     f(x,u,i) = lin_dyn_f(x,u,i)
     df(x,u)  = lin_dyn_df(x,u)
     # plotFn(x)  = plot(squeeze(x,2)')
 
     println("Entering iLQG function")
     # subplot(n=4,nc=2)
-    x, u, L, Vx, Vxx, cost, trace = iLQG(f,cost_quadratic, df, x0, 0*u00,
+    x, u, L, Vx, Vxx, cost, trace = iLQG(f,cost_quadratic, df, x0, 0 .* u00,
     lims      = lims,
     # plotFn  = x -> Plots.subplot!(x'),
     regType   = 2,
-    α         = exp10.(range(0.2, stop=-3, length=6)),
+    α         = exp10.(LinRange(0.2, -3, 6)),
     λmax      = 1e15,
     verbosity = 3,
     tol_fun   = 1e-8,
-    tol_grad   = 1e-8,
+    tol_grad  = 1e-8,
     max_iter  = 1000);
 
     doplot && plotstuff_pendcart(x00, u00, x,u,cost00,cost,trace)

From 1978d4302422465b75ff30d2071176ac8e418271 Mon Sep 17 00:00:00 2001
From: Fredrik Bagge Carlson <cont-frb@ulund.org>
Date: Fri, 7 Jun 2019 09:24:27 +0200
Subject: [PATCH 3/3] more WIP

---
 src/DifferentialDynamicProgramming.jl |   6 +-
 src/backward_pass.jl                  | 107 +++++++++++++-------------
 src/demo_linear.jl                    |  59 +++++++-------
 src/forward_pass.jl                   |  14 ++--
 src/iLQG.jl                           |   2 +-
 src/iLQGkl.jl                         |  18 +++--
 src/klutils.jl                        |  58 +++++++-------
 src/system_pendcart.jl                |  47 ++++-------
 8 files changed, 150 insertions(+), 161 deletions(-)

diff --git a/src/DifferentialDynamicProgramming.jl b/src/DifferentialDynamicProgramming.jl
index 1a193ad..ffeeeee 100644
--- a/src/DifferentialDynamicProgramming.jl
+++ b/src/DifferentialDynamicProgramming.jl
@@ -6,7 +6,9 @@ const DEBUG = false # Set this flag to true in order to print debug messages
 
 export QPTrace, boxQP, demoQP, iLQG,iLQGkl, demo_linear, demo_linear_kl, demo_pendcart, GaussianPolicy, tosvec
 
-tosvec(x) = reinterpret(SVector{size(x,1),eltype(x)}, x)[:]
+tosvec(x::AbstractMatrix{<:Number}) = reinterpret(SVector{size(x,1),eltype(x)}, x)[:]
+tosvec(x::AbstractVector{<:Number}) = SVector{length(x),eltype(x)}(x)
+tosmat(x::AbstractMatrix{<:Number}) = SMatrix{size(x,1),size(x,2),eltype(x)}(x)
 eye(n) = Matrix{Float64}(I,n,n)
 Base.strides(::SArray{Tuple{i,j}}) where {i,j} = (1,i)
 
@@ -16,7 +18,7 @@ function __init__()
     @eval function plotstuff_linear(x,u,cost,totalcost)
         p = Plots.plot(layout=(2,2))
         Plots.plot!(p,reduce(hcat,x)', title="State Trajectories", xlabel="Time step",legend=false, subplot=1, show=false)
-        Plots.plot!(p,cost,c=:black,linewidth=3, title="Cost", xlabel="Time step", subplot=2, show=false)
+        Plots.hline!(p,cost,c=:black,linewidth=3, title="Cost", xlabel="Time step", subplot=2, show=false)
         Plots.plot!(p,reduce(hcat,u)',title="Control signals", xlabel="Time step", subplot=3, show=false)
         Plots.plot!(p,totalcost,title="Total cost", xlabel="Iteration", subplot=4, show=false)
         Plots.gui()
diff --git a/src/backward_pass.jl b/src/backward_pass.jl
index 4ee5955..b1acb92 100644
--- a/src/backward_pass.jl
+++ b/src/backward_pass.jl
@@ -39,6 +39,7 @@ function back_pass(cx,cu,cxx,cxu,cuu,fx,fu,fxx,fxu,fuu,λ,regType,lims,x,u) # no
         Vx[N]  = cx[N]
         Vxx[N] = cxx[N]
         Quu[N] = cuu[N]
+        Quui[N] = inv(Quu[N])
         k[N]   = 0*cu[N]
         K[N]   = 0*fu[N]'
     end
@@ -106,12 +107,11 @@ function back_pass(cx,cu,cxx,cxu,cuu,fx,fu,fxx,fxu,fuu,λ,regType,lims,x,u) # no
             end
         end
         # debug("#  update cost-to-go approximation")
-        @matviews begin
-            dV    += [k_i'Qu; .5*k_i'Quu[i]*k_i]
-            Vx[i]  = Qx + K_i'Quu[i]*k_i + K_i'Qu + Qux'k_i
-            Vxx[i] = Qxx + K_i'Quu[i]*K_i + K_i'Qux + Qux'K_i
-            Vxx[i] = .5*(Vxx[i] + Vxx[i]')
-        end
+
+        dV    += [k_i'Qu; .5*k_i'Quu[i]*k_i]
+        Vx[i]  = Qx + K_i'Quu[i]*k_i + K_i'Qu + Qux'k_i
+        Vxx[i] = Qxx + K_i'Quu[i]*K_i + K_i'Qux + Qux'K_i
+        Vxx[i] = .5*(Vxx[i] + Vxx[i]')
 
         # debug("# save controls/gains")
         k[i] = k_i
@@ -127,55 +127,57 @@ function graphics(x...)
     return 0
 end
 
-function back_pass_gps(cx,cu,cxx::AbstractArray{T,3},cxu,cuu, fx::AbstractArray{T,3},fu,lims,x,u,kl_cost_terms) where T # quadratic timeVariant cost, linear time variant dynamics
-    m          = size(u,1)
-    n,_,N      = size(fx)
+function back_pass_gps(cx,cu,cxx,cxu,cuu, fx,fu,lims,x,u,kl_cost_terms)
+    m,N        = length(u[1]),length(u)
     ηbracket = kl_cost_terms[2]
     η = isa(ηbracket,AbstractMatrix) ? ηbracket[2,N] : ηbracket[2]
     cxkl,cukl,cxxkl,cxukl,cuukl = kl_cost_terms[1]
 
-    @assert size(cx) == (n, N)
-    @assert size(cu) == (m, N)
-    @assert size(cxx) == (n, n, N)
-    @assert size(cxu) == (n, m, N)
-    @assert size(cuu) == (m, m, N)
-
-    k          = zeros(m,N)
-    K          = zeros(m,n,N)
-    Vx         = zeros(n,N)
-    Vxx        = zeros(n,n,N)
-    Quu        = Array{T}(undef,m,m,N)
-    Quui       = Array{T}(undef,m,m,N)
-    dV         = [0., 0.]
-
-    Vx[:,N]    = cx[:,N]
-    Vxx[:,:,N] = cxx[:,:,end]
-    Quu[:,:,N] = cuu[:,:,N]./ η .+ cuukl[:,:,N]
-    Quui[:,:,N] = inv(Quu[:,:,N])
+    @show length.((cx,cu,cxx,cxu,cuu, fx,fu,lims,x,u))
+    @show length.((cxkl,cukl,cxxkl,cxukl,cuukl))
+
+    @matviews begin
+        n      = length(cx[1])
+        k      = zeros(cu[1], N)
+        K      = zeros(copy(fu[1]'), N)
+        Vx     = zeros(cx[1], N)
+        Vxx    = zeros(fx[1], N)
+        Quu    = zeros(cuu[1], N)
+        Quui   = zeros(cuu[1], N)
+        dV     = @SVector [0., 0.]
+        Vx[N]  = cx[N]
+        Vxx[N] = cxx[N]
+        Quu[N] = cuu[N]./ η .+ cuukl[N]
+        Quui[N] = inv(Quu[N])
+        k[N]   = 0*cu[N]
+        K[N]   = 0*fu[N]'
+    end
 
     diverge    = 0
     for i = N-1:-1:1
-        Qu          = cu[:,i] + fu[:,:,i]'Vx[:,i+1]
-        Qx          = cx[:,i] + fx[:,:,i]'Vx[:,i+1]
-        Qux         = cxu[:,:,i]' + fu[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-        Quu[:,:,i] .= cuu[:,:,i] .+ fu[:,:,i]'Vxx[:,:,i+1]*fu[:,:,i]
-        Qxx         = cxx[:,:,i]  + fx[:,:,i]'Vxx[:,:,i+1]*fx[:,:,i]
-
         ηbracket = kl_cost_terms[2]
         η = isa(ηbracket,AbstractMatrix) ? ηbracket[2,i] : ηbracket[2]
-        Qu         = Qu ./ η + cukl[:,i]
-        Qux        = Qux ./ η + cxukl[:,:,i]
-        Quu[:,:,i] = Quu[:,:,i] ./ η + cuukl[:,:,i]
-        Qx         = Qx ./ η + cxkl[:,i]
-        Qxx        = Qxx ./ η + cxxkl[:,:,i]
-
-        Quu[:,:,i] = 0.5*(Quu[:,:,i] + Quu[:,:,i]')
+        @matviews begin
+            Qu          = cu[i] + fu[i]'Vx[i+1]
+            Qx          = cx[i] + fx[i]'Vx[i+1]
+            Qux         = cxu[i]' + fu[i]'Vxx[i+1]*fx[i]
+            Quu[i]      = cuu[i] + fu[i]'Vxx[i+1]*fu[i]
+            Qxx         = cxx[i]  + fx[i]'Vxx[i+1]*fx[i]
+
+            Qu         = Qu ./ η + cukl[i]
+            Qux        = Qux ./ η + cxukl[i]
+            Quu[i] = Quu[i] ./ η + cuukl[i]
+            Qx         = Qx ./ η + cxkl[i]
+            Qxx        = Qxx ./ η + cxxkl[i]
+
+            Quu[i] = 0.5*(Quu[i] + Quu[i]')
+        end
 
         if isempty(lims) || lims[1,1] > lims[1,2]
             # debug("#  no control limits: Cholesky decomposition, check for non-PD")
             local R
             try
-                R = cholesky(Hermitian(Quu[:,:,i]))
+                R = cholesky(Hermitian(Quu[i]))
             catch
                 diverge  = i
                 return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx, dV
@@ -186,11 +188,11 @@ function back_pass_gps(cx,cu,cxx::AbstractArray{T,3},cxu,cuu, fx::AbstractArray{
             K_i = -(R\Qux)
         else
             # debug("#  solve Quadratic Program")
-            lower = lims[:,1]-u[:,i]
-            upper = lims[:,2]-u[:,i]
+            lower = lims[:,1]-u[i]
+            upper = lims[:,2]-u[i]
             local k_i,result,free
             try
-                k_i,result,R,free = boxQP(Quu[:,:,i],Qu,lower,upper,k[:,min(i+1,N-1)])
+                k_i,result,R,free = boxQP(Quu[i],Qu,lower,upper,k[i+1])
             catch
                 result = 0
             end
@@ -198,7 +200,7 @@ function back_pass_gps(cx,cu,cxx::AbstractArray{T,3},cxu,cuu, fx::AbstractArray{
                 diverge  = i
                 return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx, dV
             end
-            K_i  = zeros(m,n)
+            K_i = 0*K[i]
             if any(free)
                 Lfree         = -R\(R'\Qux[free,:])
                 K_i[free,:]   = Lfree
@@ -206,15 +208,16 @@ function back_pass_gps(cx,cu,cxx::AbstractArray{T,3},cxu,cuu, fx::AbstractArray{
         end
         # debug("#  update cost-to-go approximation")
 
-        dV         = dV  + [k_i'Qu; .5*k_i'Quu[:,:,i]*k_i]
-        Vx[:,i]    = Qx  + K_i'Quu[:,:,i]*k_i + K_i'Qu + Qux'k_i
-        Vxx[:,:,i] = Qxx + K_i'Quu[:,:,i]*K_i + K_i'Qux + Qux'K_i
-        Vxx[:,:,i] = .5*(Vxx[:,:,i] + Vxx[:,:,i]')
+        dV    += [k_i'Qu; .5*k_i'Quu[i]*k_i]
+        Vx[i]  = Qx  + K_i'Quu[i]*k_i + K_i'Qu + Qux'k_i
+        Vxx[i] = Qxx + K_i'Quu[i]*K_i + K_i'Qux + Qux'K_i
+        Vxx[i] = .5*(Vxx[i] + Vxx[i]')
+
 
         # debug("# save controls/gains")
-        k[:,i]   = k_i
-        K[:,:,i] = K_i
-        Quui[:,:,i] = inv(Quu[:,:,i])
+        k[i] = k_i
+        K[i] = K_i
+        Quui[i] = inv(Quu[i])
     end
 
     return diverge, GaussianPolicy(N,n,m,K,k,Quui,Quu), Vx, Vxx,dV
diff --git a/src/demo_linear.jl b/src/demo_linear.jl
index 257a0d3..33d6cec 100644
--- a/src/demo_linear.jl
+++ b/src/demo_linear.jl
@@ -22,8 +22,8 @@ function demo_linear(;kwargs...)
     lims = []            #ones(m,1)*[-1 1]*.6
 
     T    = 1000          # horizon
-    x0   = ones(n,1)     # initial state
-    u0   = .1*randn(m,T) # initial controls
+    x0   = ones(n)     # initial state
+    u0   = .1*randn(m,T) |> tosvec # initial controls
 
     # optimization problem
     N    = T+1
@@ -33,20 +33,19 @@ function demo_linear(;kwargs...)
     cxu  = zeros(size(B))
     cuu  = R
     function lin_dyn_df(x,u,Q,R)
-        u[isnan.(u)] .= 0
-        cx  = Q*x
-        cu  = R*u
-        fxx=fxu=fuu = []
+        cx  = Ref(Q) .* x
+        cu  = Ref(R) .* u
+        fxx=fxu=fuu = nothing
         return fx,fu,fxx,fxu,fuu,cx,cu,cxx,cxu,cuu
     end
     function lin_dyn_f(x,u,A,B,Q,R)
-        u[isnan.(u)] .= 0
-        xnew = A*x + B*u
-        return xnew
+        A*x + B*u
     end
-    lin_dyn_fT(x,Q) = 0.5*sum(x.*(Q*x))
+    quadform(x::AbstractVector{<:AbstractVector},Q) = sum(x'Q*x for x in x)
+    quadform(x,Q) = x'Q*x
+    lin_dyn_fT(x,Q) = 0.5*quadform(x,Q)
     f(x,u,i)        = lin_dyn_f(x,u,A,B,Q,R)
-    costfun(x,u)    = 0.5*sum(x.*(Q*x)) + 0.5*sum(u.*(R*u))
+    costfun(x,u)    = 0.5*quadform(x,Q) + 0.5*quadform(u,R)
     df(x,u)         = lin_dyn_df(x,u,Q,R)
     # plotFn(x)  = plot(squeeze(x,2)')
 
@@ -54,7 +53,6 @@ function demo_linear(;kwargs...)
     @time x, u, traj_new, Vx, Vxx, cost, otrace = iLQG(f,costfun,df, x0, u0; lims=lims,kwargs...);
 
     totalcost = get(otrace, :cost)[2]
-
     plotstuff_linear(x,u,[cost],totalcost)
     x, u, traj_new, Vx, Vxx, cost, otrace
 end
@@ -81,36 +79,37 @@ function demo_linear_kl(;kwargs...)
 
     T    = 1000          # horizon
     x0   = ones(n)       # initial state
-    u    = .1*randn(m,T) # initial controls
+    u    = .1*randn(m,T) |> tosvec # initial controls
 
     # optimization problem
     N    = T+1
-    fx   = repeat(A,1,1,T)
-    fu   = repeat(B,1,1,T)
-    cxx  = repeat(Q,1,1,T)
-    cxu  = repeat(zeros(size(B)),1,1,T)
-    cuu  = repeat(R,1,1,T)
+    fx   = [tosmat(A) for _ in 1:T]
+    fu   = [tosmat(B) for _ in 1:T]
+    cxx  = [tosmat(Q) for _ in 1:T]
+    cxu  = [tosmat(zeros(size(B))) for _ in 1:T]
+    cuu  = [tosmat(R) for _ in 1:T]
     function lin_dyn_df(x,u,Q,R)
-        u[isnan.(u)] .= 0
-        cx  = Q*x
-        cu  = R*u
-        fxx=fxu=fuu = []
+        cx  = Ref(Q) .* x
+        cu  = Ref(R) .* u
+        fxx=fxu=fuu = nothing
         return fx,fu,fxx,fxu,fuu,cx,cu,cxx,cxu,cuu
     end
     function lin_dyn_f(x,u,A,B,Q,R)
-        u[isnan.(u)] .= 0
-        xnew = A*x + B*u
-        return xnew
+        A*x + B*u
     end
+    quadform(x::AbstractVector{<:AbstractVector},Q) = sum(x'Q*x for x in x)
+    quadform(x,Q) = x'Q*x
+
     dyn = (x,u,i)   -> lin_dyn_f(x,u,A,B,Q,R)
-    costf = (x,u) -> 0.5*(sum(x.*(Q*x),dims=1) + sum(u.*(R*u),dims=1))[:]
+    costf = (x,u) -> 0.5*quadform(x,Q) + 0.5*quadform(u,R)
     diffdyn = (x,u)  -> lin_dyn_df(x,u,Q,R)
 
     function rollout(u)
-        x = zeros(n,T)
-        x[:,1] = x0
+        x = zeros(n,T) |> tosvec
+        @show typeof(x)
+        x[1] = x0
         for t = 1:T-1
-            x[:,t+1] = dyn(x[:,t],u[:,t],t)
+            x[t+1] = dyn(x[t],u[t],t)
         end
         x
     end
@@ -122,7 +121,7 @@ function demo_linear_kl(;kwargs...)
     local Vx, Vxx, cost, otrace, totalcost
     outercosts = zeros(5)
     @time for iter = 1:5
-        cost0 = 0.5*sum(x.*(Q*x)) + 0.5*sum(u.*(R*u))
+        cost0 = 0.5*(quadform(x,Q) +quadform(u,R))
         x, u, traj, Vx, Vxx, cost, otrace = iLQGkl(dyn,costf,diffdyn, x, traj, model; cost=cost0, lims=lims,kwargs...);
         totalcost = get(otrace, :cost)[2]
         outercosts[iter] = sum(totalcost)
diff --git a/src/forward_pass.jl b/src/forward_pass.jl
index 80b8ca8..05ac0b0 100644
--- a/src/forward_pass.jl
+++ b/src/forward_pass.jl
@@ -44,14 +44,14 @@ function forward_covariance(model, x, u, traj)
     Σ0       = R1 # TODO: I was lazy here
     ix = 1:n
     iu = n+1:n+m
-    sigmanew = Array{Float64}(undef,n+m,n+m,N)
-    sigmanew[ix,ix,1] = Σ0
+    sigmanew = [Array{Float64}(undef,n+m,n+m) for _ in 1:N]
+    sigmanew[1][ix,ix] = Σ0
     for i = 1:N-1
-        K,Σ = traj.K[i], traj.Σ[:,:,i]
-        sigmanew[ix,ix,i+1] = fx[i]*sigmanew[ix,ix,i]*fx[i]' + R1 # Iterate dLyap forward
-        sigmanew[iu,ix,i] = K*sigmanew[ix,ix,i]
-        sigmanew[ix,iu,i] = sigmanew[ix,ix,i]*K'
-        sigmanew[iu,iu,i] = K*sigmanew[ix,ix,i]*K' + Σ
+        K,Σ = traj.K[i], traj.Σ[i]
+        sigmanew[i+1][ix,ix] = fx[i]*sigmanew[i][ix,ix]*fx[i]' + R1 # Iterate dLyap forward
+        sigmanew[i][iu,ix] = K*sigmanew[i][ix,ix]
+        sigmanew[i][ix,iu] = sigmanew[i][ix,ix]*K'
+        sigmanew[i][iu,iu] = K*sigmanew[i][ix,ix]*K' + Σ
     end
     sigmanew
 end
diff --git a/src/iLQG.jl b/src/iLQG.jl
index ba1a930..be19945 100644
--- a/src/iLQG.jl
+++ b/src/iLQG.jl
@@ -50,7 +50,7 @@ eye(P,n) = Matrix{P}(I,n,n)
 GaussianPolicy(P) = GaussianPolicy(0,0,0,emptyMat3(P),emptyMat2(P),emptyMat3(P),emptyMat3(P))
 # GaussianPolicy(P,T,n,m) = GaussianPolicy(T,n,m,zeros(P,m,n,T),zeros(P,m,T),cat([eye(P,m) for t=1:T]..., dims=3),cat([eye(P,m) for t=1:T]..., dims=3))
 
-GaussianPolicy(P,T,n,m) = GaussianPolicy(0,0,0,zeros(SMatrix{m,n,P},T),zeros(SVector{m,P},T),zeros(SMatrix{m,m,P},T),zeros(SMatrix{m,m,P},T))
+GaussianPolicy(P,T,n,m) = GaussianPolicy(T,n,m,zeros(SMatrix{m,n,P},T),zeros(SVector{m,P},T),zeros(SMatrix{m,m,P},T),zeros(SMatrix{m,m,P},T))
 Base.isempty(gp::GaussianPolicy) = gp.T == gp.n == gp.m == 0
 Base.length(gp::GaussianPolicy) = gp.T
 
diff --git a/src/iLQGkl.jl b/src/iLQGkl.jl
index 8108206..fbdae24 100644
--- a/src/iLQGkl.jl
+++ b/src/iLQGkl.jl
@@ -46,8 +46,8 @@ function iLQGkl(dynamics,costfun,derivs, x0, traj_prev, model;
     # --- initial sizes and controls
     u            = copy(traj_prev.k) # initial control sequence
     n            = size(x0, 1) # dimension of state vector
-    m,N          = size(u) # dimension of control vector and number of state transitions
-    traj_new     = GaussianPolicy(Float64)
+    m,N          = length(u[1]),length(u) # dimension of control vector and number of state transitions
+    traj_new     = GaussianPolicy(Float64, N,n,m)
     k_old = copy(traj_prev.k)
     traj_prev.k *= 0 # We are adding new k to u, so must set this to zero for correct kl calculations
     ηbracket     = copy(ηbracket) # Because we do changes in this Array
@@ -62,7 +62,7 @@ function iLQGkl(dynamics,costfun,derivs, x0, traj_prev, model;
 
     # --- initial trajectory
     debug("Checking initial trajectory")
-    if size(x0,2) == N
+    if x0 isa Vector{<:AbstractVector}
         debug("# pre-rolled initial forward pass, initial traj provided")
         x        = x0
         diverge  = false
@@ -124,14 +124,16 @@ function iLQGkl(dynamics,costfun,derivs, x0, traj_prev, model;
         end
 
         #  check for termination due to small gradient
-        g_norm = mean(maximum(abs.(traj_new.k) ./ (abs.(u) .+1),dims=1))
+        g_norm = mean(zip(traj_new.k,u)) do (k,u)
+            maximum(abs.(k) ./ (abs.(u) .+ 1))
+        end
         trace(:grad_norm, iter, g_norm)
 
         # ====== STEP 3: Forward pass
 
         _t = @elapsed begin
             # debug("#  entering forward_pass")
-            xnew,unew,costnew = forward_pass(traj_new, x0[:,1] ,u, x,1,dynamics,costfun, lims, diff_fun)
+            xnew,unew,costnew = forward_pass(traj_new, x0[1] ,u, x,1,dynamics,costfun, lims, diff_fun)
             sigmanew = forward_covariance(model, x, u, traj_new)
             traj_new.k .+= traj_prev.k # unew = k_new + k_old + Knew*Δx, this doesn't matter since traj_prev.k set to 0 above
             Δcost    = sum(cost) - sum(costnew)
@@ -202,7 +204,7 @@ function iLQGkl(dynamics,costfun,derivs, x0, traj_prev, model;
                 end
             end
 
-            xnew,unew,costnew = forward_pass(traj_new, x0[:,1] ,u, x,1,dynamics,costfun, lims, diff_fun)
+            xnew,unew,costnew = forward_pass(traj_new, x0[1] ,u, x,1,dynamics,costfun, lims, diff_fun)
             sigmanew = forward_covariance(model, x, u, traj_new)
             traj_new.k .+= traj_prev.k # unew = k_new + k_old + Knew*Δx
             Δcost                 = sum(cost) - sum(costnew)
@@ -216,7 +218,9 @@ function iLQGkl(dynamics,costfun,derivs, x0, traj_prev, model;
             # println(maximum(constraint_violation), " ", extrema(η), " ", indmax(constraint_violation))
             # println(round.(constraint_violation,4))
             η                    .= clamp.(η, ηbracket[1,:], ηbracket[3,:])
-            g_norm                = mean(maximum(abs.(traj_new.k) ./ (abs.(u) .+1),dims=1))
+            g_norm                = mean(zip(traj_new.k,u)) do (k,u)
+                maximum(abs.(k) ./ (abs.(u) .+ 1))
+            end
             trace(:grad_norm, iter, g_norm)
             # @show maximum(constraint_violation)
             if all(divergence .< 2*kl_step) && mean(constraint_violation) < 0.1*kl_step[1]
diff --git a/src/klutils.jl b/src/klutils.jl
index 1ebf4e3..ec533c0 100644
--- a/src/klutils.jl
+++ b/src/klutils.jl
@@ -9,15 +9,15 @@ function ∇kl(traj_prev)
     isempty(traj_prev) && (return (0,0,0,0,0))
     debug("Calculating KL cost addition terms")
     m,n,T  = traj_prev.m,traj_prev.n,traj_prev.T
-    cx,cu,cxx,cuu,cxu = zeros(n,T),zeros(m,T),zeros(n,n,T),zeros(m,m,T),zeros(m,n,T)
+    cx,cu,cxx,cuu,cxu = [zeros(n) for _ in 1:T],[zeros(m) for _ in 1:T],[zeros(n,n) for _ in 1:T],[zeros(m,m) for _ in 1:T],[zeros(m,n) for _ in 1:T]
     for t in 1:T
-        K, k       = traj_prev.K[:,:,t], traj_prev.k[:,t]
-        Σi         = traj_prev.Σi[:,:,t]
-        cx[:,t]    = K'*Σi*k
-        cu[:,t]    = -Σi*k
-        cxx[:,:,t] = K'*Σi*K
-        cuu[:,:,t] = Σi
-        cxu[:,:,t] = -Σi*K # https://github.com/cbfinn/gps/blob/master/python/gps/algorithm/traj_opt/traj_opt_lqr_python.py#L355
+        K, k       = traj_prev.K[t], traj_prev.k[t]
+        Σi         = traj_prev.Σi[t]
+        cx[t]    = K'*Σi*k
+        cu[t]    = -Σi*k
+        cxx[t] = K'*Σi*K
+        cuu[t] = Σi
+        cxu[t] = -Σi*K # https://github.com/cbfinn/gps/blob/master/python/gps/algorithm/traj_opt/traj_opt_lqr_python.py#L355
     end
     return cx,cu,cxx,cxu,cuu
 end
@@ -43,16 +43,16 @@ function kl_div(xnew,xold, Σ_new, traj_new, traj_prev)
     # m     = traj_new.m
     kldiv = zeros(T)
     for t = 1:T
-        μt    = μ_new[:,t]
-        Σt    = Σ_new[:,:,t]
-        Kp    = traj_prev.K[:,:,t]
-        Kn    = traj_new.K[:,:,t]
-        kp    = traj_prev.k[:,t]
-        kn    = traj_new.k[:,t] + kp # unew must be added here
-        Σp    = traj_prev.Σ[:,:,t]
-        Σn    = traj_new.Σ[:,:,t]
-        Σip   = traj_prev.Σi[:,:,t]
-        Σin   = traj_new.Σi[:,:,t]
+        μt    = μ_new[t]
+        Σt    = Σ_new[t]
+        Kp    = traj_prev.K[t]
+        Kn    = traj_new.K[t]
+        kp    = traj_prev.k[t]
+        kn    = traj_new.k[t] + kp # unew must be added here
+        Σp    = traj_prev.Σ[t]
+        Σn    = traj_new.Σ[t]
+        Σip   = traj_prev.Σi[t]
+        Σin   = traj_new.Σi[t]
         Mp,vp = KLmv(Σip,Kp,kp)
         Mn,vn = KLmv(Σin,Kn,kn)
         cp    = .5*kp'Σip*kp
@@ -72,16 +72,16 @@ function kl_div_wiki(xnew,xold, Σ_new, traj_new, traj_prev)
     T,m,n     = traj_new.T, traj_new.m, traj_new.n
     kldiv = zeros(T)
     for t = 1:T
-        μt     = μ_new[:,t]
-        Σt     = Σ_new[1:n,1:n,t]
-        Kp     = traj_prev.K[:,:,t]
-        Kn     = traj_new.K[:,:,t]
-        kp     = traj_prev.k[:,t]
-        kn     = traj_new.k[:,t] #traj_new.k[:,t] contains kp already
-        Σp     = traj_prev.Σ[:,:,t]
-        Σn     = traj_new.Σ[:,:,t]
-        Σip    = traj_prev.Σi[:,:,t]
-        Σin    = traj_new.Σi[:,:,t]
+        μt     = μ_new[t]
+        Σt     = Σ_new[t][1:n,1:n]
+        Kp     = traj_prev.K[t]
+        Kn     = traj_new.K[t]
+        kp     = traj_prev.k[t]
+        kn     = traj_new.k[t] #traj_new.k[t] contains kp already
+        Σp     = traj_prev.Σ[t]
+        Σn     = traj_new.Σ[t]
+        Σip    = traj_prev.Σi[t]
+        Σin    = traj_new.Σi[t]
         dim    = m
         k_diff = kp-kn
         K_diff = Kp-Kn
@@ -101,7 +101,7 @@ end
 
 
 
-entropy(traj::GaussianPolicy) = mean(logdet(traj.Σ[:,:,t])/2 for t = 1:traj.T) + traj.m*log(2π)/2
+entropy(traj::GaussianPolicy) = mean(logdet(traj.Σ[t])/2 for t = 1:traj.T) + traj.m*log(2π)/2
 
 """
 new_η, satisfied, divergence = calc_η(xnew,xold,sigmanew,η, traj_new, traj_prev, kl_step)
diff --git a/src/system_pendcart.jl b/src/system_pendcart.jl
index f471f53..4e546f9 100644
--- a/src/system_pendcart.jl
+++ b/src/system_pendcart.jl
@@ -26,7 +26,7 @@ end
 
 
 """
-    demo_pendcart(;kwargs...)
+demo_pendcart(;kwargs...)
 
 Run the iLQG Function to find an optimal trajectory For the "pendulum on a cart system". Requires package ControlSystems.jl
 
@@ -40,12 +40,12 @@ Run the iLQG Function to find an optimal trajectory For the "pendulum on a cart
 `doplot = true`                 : Plot results
 """
 function demo_pendcart(;x0 = [π-0.6,0,0,0], goal = [π,0,0,0],
-                        Q      = Diagonal([10.,1,2,1]), # State weight matrix
-                        R      = 1.,                    # Control weight matrix
-                        lims   = 5.0*[-1 1],            # control limits,
-                        T      = 600,                   # Number of time steps
-                        doplot = true                   # Plot results
-                        )
+    Q      = Diagonal([10.,1,2,1]), # State weight matrix
+    R      = 1.,                    # Control weight matrix
+    lims   = 5.0*[-1 1],            # control limits,
+    T      = 600,                   # Number of time steps
+    doplot = true                   # Plot results
+    )
 
     N = T+1
     g = 9.82
@@ -62,30 +62,11 @@ function demo_pendcart(;x0 = [π-0.6,0,0,0], goal = [π,0,0,0],
     L = lqr(A,B,Q,R) # Calculate the optimal state feedback
 
 
-    function fsys_closedloop(t,x,L,xd)
-        dx = copy(x)
-        dx[1] -= pi
-        u = -(L*dx)[1]
-        xd[1] = x[2]
-        xd[2] = -g/l * sin(x[1]) + u/l * cos(x[1]) - d*x[2]
-        xd[3] = x[4]
-        xd[4] = u
-    end
-
-    function fsys(t,x,u,xd)
-        xd[1] = x[2]
-        xd[2] = -g/l * sin(x[1]) + u/l * cos(x[1]) - d*x[2]
-        xd[3] = x[4]
-        xd[4] = u
-    end
-
-    dfvec = zeros(4)
     function dfsys(x,u)
-        dfvec[1] = x[1]+h*x[2]
-        dfvec[2] = x[2]+h*(-g/l*sin(x[1])+u[1]/l*cos(x[1])- d*x[2])
-        dfvec[3] = x[3]+h*x[4]
-        dfvec[4] = x[4]+h*u[1]
-        dfvec
+        SVector(x[1]+h*x[2],
+        x[2]+h*(-g/l*sin(x[1])+u[1]/l*cos(x[1])- d*x[2]),
+        x[3]+h*x[4],
+        x[4]+h*u[1])
     end
 
 
@@ -120,9 +101,9 @@ function demo_pendcart(;x0 = [π-0.6,0,0,0], goal = [π,0,0,0],
 
 
     fxc = Float64[0 1 0 0;
-                0 0 0 0;
-                0 0 0 1;
-                0 0 0 0]
+    0 0 0 0;
+    0 0 0 1;
+    0 0 0 0]
     fuc = reshape(Float64[0, 0, 0, 1], 4,1)
     fxc = [copy(fxc) for i = 1:N]
     fuc = [copy(fuc) for i = 1:N]