From e16759421213f22f7100d206dc0f154a0318ee31 Mon Sep 17 00:00:00 2001 From: Thomas Wutzler Date: Mon, 23 Dec 2024 20:01:22 +0100 Subject: [PATCH 1/6] test sample_zeta --- Project.toml | 8 ++- src/HybridVariationalInference.jl | 8 +++ src/elbo.jl | 96 ++++++++++++++++++++++++++ src/util._transformvariablesjl | 21 ++++++ test/Project.toml | 2 +- test/runtests.jl | 2 + test/test_elbo.jl | 109 ++++++++++++++++++++++++++++++ test/test_sample_zeta.jl | 99 +++++++++++++++++++++++++++ 8 files changed, 343 insertions(+), 2 deletions(-) create mode 100644 src/elbo.jl create mode 100644 src/util._transformvariablesjl create mode 100644 test/test_elbo.jl create mode 100644 test/test_sample_zeta.jl diff --git a/Project.toml b/Project.toml index a0f407d..0c86f31 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Thomas Wutzler and contributors"] version = "1.0.0-DEV" [deps] +BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" @@ -13,6 +14,8 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +TransformVariables = "84d833dd-6860-57f9-a1a7-6da5db126cff" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [weakdeps] Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" @@ -25,8 +28,9 @@ HybridVariationalInferenceLuxExt = "Lux" HybridVariationalInferenceSimpleChainsExt = "SimpleChains" [compat] -ChainRulesCore = "1.25" +BlockDiagonals = "0.1.42" CUDA = "5.5.2" +ChainRulesCore = "1.25" Combinatorics = "1.0.2" ComponentArrays = "0.15.19" Flux = "v0.15.2" @@ -37,6 +41,8 @@ Random = "1.10.0" SimpleChains = "0.4" StatsBase = "0.34.4" StatsFuns = "1.3.2" +TransformVariables = "0.8.10" +Zygote = "0.6.73" julia = "1.10" [workspace] diff --git a/src/HybridVariationalInference.jl b/src/HybridVariationalInference.jl index 1d7b2bb..7dee309 100644 --- a/src/HybridVariationalInference.jl +++ b/src/HybridVariationalInference.jl @@ -8,6 +8,12 @@ using GPUArraysCore using LinearAlgebra using CUDA using ChainRulesCore +using TransformVariables +using Zygote # Zygote.@ignore CUDA.randn +using BlockDiagonals + +export inverse_ca +include("util._transformvariablesjl") export ComponentArrayInterpreter, flatten1, get_concrete include("ComponentArrayInterpreter.jl") @@ -32,6 +38,8 @@ include("util_opt.jl") #export - all internal include("cholesky.jl") +include("elbo.jl") + export DoubleMM include("DoubleMM/DoubleMM.jl") diff --git a/src/elbo.jl b/src/elbo.jl new file mode 100644 index 0000000..db0cd97 --- /dev/null +++ b/src/elbo.jl @@ -0,0 +1,96 @@ +""" +Generate samples of (inv-transformed) model parameters, ζ, and Log-Determinant +of their distribution. + +Adds the MV-normally distributed residuals, retrieved by `sample_ζ_norm0` +to the means extracted from parameters and predicted by the machine learning +model. 
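+
+Returns the matrix `ζ` with one sample per column, together with `logdetΣ`,
+the log-determinant of the covariance matrix used to generate the residuals.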
+""" +function generate_ζ(rng, g, f, ϕ::AbstractVector, x::AbstractMatrix, + interpreters::NamedTuple; n_MC=3) + # see documentation of neg_elbo_transnorm_gf + ϕc = interpreters.μP_ϕg_unc(CA.getdata(ϕ)) + μ_ζP = ϕc.μP + ϕg = ϕc.ϕg + μ_ζMs0 = g(x, ϕg) # TODO provide μ_ζP to g + ζ_resid, logdetΣ = sample_ζ_norm0(rng, μ_ζP, μ_ζMs0, ϕc.unc; n_MC) + #ζ_resid, logdetΣ = sample_ζ_norm0(rng, ϕ[1:2], reshape(ϕ[2 .+ (1:20)],2,:), ϕ[(end-length(interpreters.unc)+1):end], interpreters.unc; n_MC) + ζ = stack(map(eachcol(ζ_resid)) do r + rc = interpreters.PMs(r) + ζP = μ_ζP .+ rc.θP + μ_ζMs = μ_ζMs0 # g(x, ϕc.ϕ) # TODO provide ζP to g + ζMs = μ_ζMs .+ rc.θMs + vcat(ζP, vec(ζMs)) + end) + ζ, logdetΣ +end + +""" +Extract relevant parameters from θ and return n_MC generted draws +together with the logdet of the transformation. + +Necessary typestable information on number of compponents are provided with +ComponentMarshellers +- marsh_pmu(n_θP, n_θMs, Unc=n_θUnc) +- marsh_batch(n_batch) +- marsh_unc(n_UncP, n_UncM, n_UncCorr) +""" +function sample_ζ_norm0(rng::Random.AbstractRNG, ζP::AbstractVector, ζMs::AbstractMatrix, ϕunc::AbstractVector, args...; + n_MC=3) + n_θP, n_θMs = length(ζP), length(ζMs) + urand = _create_random(rng, CA.getdata(ζP), n_θP + n_θMs, n_MC) + sample_ζ_norm0(urand, ζP, ζMs, ϕunc, args...) +end + +function sample_ζ_norm0(urand::AbstractMatrix, ζP::AbstractVector{T}, ζMs::AbstractMatrix, + ϕunc::AbstractVector, int_unc = ComponentArrayInterpreter(ϕunc); + ) where {T} + ϕuncc = int_unc(CA.getdata(ϕunc)) + n_θP, n_θMs, (n_θM, n_batch) = length(ζP), length(ζMs), size(ζMs) + # make sure to not create a UpperTriangular Matrix of an CuArray in transformU_cholesky1 + UP = transformU_cholesky1(ϕuncc.ρsP) + UM = transformU_cholesky1(ϕuncc.ρsM) + cf = ϕuncc.coef_logσ2_logMs + logσ2_logMs = vec(cf[1, :] .+ cf[2, :] .* ζMs) + logσ2_logP = vec(CA.getdata(ϕuncc.logσ2_logP)) + # CUDA cannot multiply BlockDiagonal * Diagonal, construct already those blocks + σMs = reshape(exp.(logσ2_logMs ./ 2), n_θM, :) + σP = exp.(logσ2_logP ./ 2) + # BlockDiagonal does work with CUDA, but not with combination of Zygote and CUDA + # need to construct full matrix for CUDA + Uσ = _create_blockdiag(UP, UM, σP, σMs, n_batch) + ζ_resid = Uσ' * urand + logdetΣ = 2 .* sum(log.(diag(Uσ))) + # returns CuArrays to either continue on GPU or need to transfer to CPU + ζ_resid, logdetΣ +end + +function _create_blockdiag(UP::AbstractMatrix{T}, UM, σP, σMs, n_batch) where {T} + v = [i == 0 ? UP * Diagonal(σP) : UM * Diagonal(σMs[:, i]) for i in 0:n_batch] + BlockDiagonal(v) +end +function _create_blockdiag(UP::GPUArraysCore.AbstractGPUMatrix{T}, UM, σP, σMs, n_batch) where {T} + # using BlockDiagonal leads to Scalar operations downstream + # v = [i == 0 ? UP * Diagonal(σP) : UM * Diagonal(σMs[:, i]) for i in 0:n_batch] + # BlockDiagonal(v) + # Uσ = cat([i == 0 ? UP * Diagonal(σP) : UM * Diagonal(σMs[:, i]) for i in 0:n_batch]...; + # dims=(1, 2)) + # on GPU use only one big multiplication rather than many small ones + U = cat([i == 0 ? UP : UM for i in 0:n_batch]...; dims=(1, 2)) + #Main.@infiltrate_main + σD = Diagonal(vcat(σP, vec(σMs))) + Uσ = U * σD + # need for Zygote why? + # tmp = cat(Uσ; dims=(1,2)) + tmp = vcat(Uσ) +end + +function _create_random(rng, ::AbstractVector{T}, dims...) where {T} + rand(rng, T, dims...) +end +function _create_random(rng, ::GPUArraysCore.AbstractGPUVector{T}, dims...) 
where {T} + # ignores rng + # https://discourse.julialang.org/t/help-using-cuda-zygote-and-random-numbers/123458/4?u=bgctw + # Zygote.@ignore CUDA.randn(rng, dims...) + Zygote.@ignore CUDA.randn(dims...) +end \ No newline at end of file diff --git a/src/util._transformvariablesjl b/src/util._transformvariablesjl new file mode 100644 index 0000000..8e008b8 --- /dev/null +++ b/src/util._transformvariablesjl @@ -0,0 +1,21 @@ +""" +Apply TransformVariables.inverse to ComponentArray, `ca`. + +- convert `ca` to a `NamedTuple` +- apply transformation +- convert back to `ComponentArray` +""" +function inverse_ca(trans, ca::CA.AbstractArray) + CA.ComponentArray( + TransformVariables.inverse(trans, cv2NamedTuple(ca)), + CA.getaxes(ca)) +end + +""" +Convert ComponentVector to NamedTuple of the first layer, i.e. keep +ComponentVectors in the second level. +""" +function cv2NamedTuple(ca::CA.ComponentVector) + g = ((k, CA.getdata(ca[k])) for k in keys(ca)) + (; g...) +end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index 3006d5b..97ffeaf 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -15,5 +15,5 @@ StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +TransformVariables = "84d833dd-6860-57f9-a1a7-6da5db126cff" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" -cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" diff --git a/test/runtests.jl b/test/runtests.jl index 49f0f37..18434b3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,6 +13,8 @@ const GROUP = get(ENV, "GROUP", "All") # defined in in CI.yml @time @safetestset "test_doubleMM" include("test_doubleMM.jl") #@safetestset "test" include("test/test_cholesky_structure.jl") @time @safetestset "test_cholesky_structure" include("test_cholesky_structure.jl") + #@safetestset "test" include("test/test_sample_zeta.jl") + @time @safetestset "test_sample_zeta" include("test_sample_zeta.jl") # #@safetestset "test" include("test/test_Flux.jl") @time @safetestset "test_Flux" include("test_Flux.jl") diff --git a/test/test_elbo.jl b/test/test_elbo.jl new file mode 100644 index 0000000..62064f7 --- /dev/null +++ b/test/test_elbo.jl @@ -0,0 +1,109 @@ +#using LinearAlgebra, BlockDiagonals + +using Test +using Zygote +using HybridVariationalInference +using HybridVariationalInference: HybridVariationalInference as CP +using StableRNGs +using CUDA +using GPUArraysCore: GPUArraysCore +using Random +using SimpleChains +using ComponentArrays: ComponentArrays as CA +using TransformVariables + +#CUDA.device!(4) +rng = StableRNG(111) + +const case = DoubleMM.DoubleMMCase() +const MLengine = Val(nameof(SimpleChains)) +scenario = (:default,) + +#θsite_true = get_hybridcase_par_templates(case; scenario) +g, ϕg0 = gen_hybridcase_MLapplicator(case, MLengine; scenario); +f = gen_hybridcase_PBmodel(case; scenario) + +(; n_covar, n_site, n_batch, n_θM, n_θP) = get_hybridcase_sizes(case; scenario) + +(; xM, θP_true, θMs_true, xP, y_global_true, y_true, y_global_o, y_o +) = gen_hybridcase_synthetic(case, rng; scenario); + +# correlation matrices +ρsP = zeros(sum(1:(n_θP-1))) +ρsM = zeros(sum(1:(n_θM-1))) + +() -> begin + coef_logσ2_logMs = [-5.769 -3.501; -0.01791 0.007951] + logσ2_logP = CA.ComponentVector(r0=-8.997, K2=-5.893) + #mean_σ_o_MC = 0.006042 + + ϕunc = CA.ComponentVector(; + logσ2_logP=logσ2_logP, + coef_logσ2_logMs=coef_logσ2_logMs, + ρsP, + ρsM) +end + + +# for a 
conservative uncertainty assume σ2=1e-10 and no relationship with magnitude +ϕunc0 = CA.ComponentVector(; + logσ2_logP=fill(-10.0, n_θP), + coef_logσ2_logMs=reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), + ρsP, + ρsM) +#int_unc = ComponentArrayInterpreter(ϕunc0) + +transPMs_batch = as( + (P=as(Array, asℝ₊, n_θP), + Ms=as(Array, asℝ₊, n_θM, n_batch))) +transPMs_all = as( + (P=as(Array, asℝ₊, n_θP), + Ms=as(Array, asℝ₊, n_θM, n_site))) + +ϕ_true = θ = CA.ComponentVector(; + μP=θP_true, + ϕg=ϕg0, #ϕg_opt, # here start from randomized + unc=ϕunc); +trans_gu = as( + (μP=as(Array, asℝ₊, n_θP), + ϕg=as(Array, length(ϕg0)), + unc=as(Array, length(ϕunc)))) +trans_g = as( + (μP=as(Array, asℝ₊, n_θP), + ϕg=as(Array, length(ϕg0)))) + +int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP_true, + θMs=CA.ComponentMatrix( + zeros(n_θM, n_batch), first(CA.getaxes(θMs_true)), CA.Axis(i=1:n_batch)))) + +interpreters = map(get_concrete,(; + μP_ϕg_unc=ComponentArrayInterpreter(ϕ_true), + PMs=int_PMs_batch, + unc=ComponentArrayInterpreter(ϕunc0) + )) + +ϕg_true_vec = CA.ComponentVector( + TransformVariables.inverse(trans_gu, CP.cv2NamedTuple(ϕ_true))) +ϕcg_true = interpreters.μP_ϕg_unc(ϕg_true_vec) +ϕ_ini = ζ = vcat(ϕcg_true[[:μP, :ϕg]] .* 1.2, ϕcg_true[[:unc]]); +ϕ_ini0 = ζ = vcat(ϕcg_true[:μP] .* 0.0, ϕg0, ϕunc0); + +neg_elbo_transnorm_gf(rng, g, f, ϕcg_true, y_o[:, 1:n_batch], xM[:, 1:n_batch], + transPMs_batch, map(get_concrete, interpreters); + n_MC=8, logσ2y) +Zygote.gradient(ϕ -> neg_elbo_transnorm_gf( + rng, g, f, ϕ, y_o[:, 1:n_batch], x_o[:, 1:n_batch], + transPMs_batch, interpreters; n_MC=8, logσ2y), ϕcg_true) + + @testset "generate_ζ" begin + ϕ = CA.getdata(ϕ_cpu) + n_sample_pred = 200 + intm_PMs_gen = ComponentArrayInterpreter(CA.ComponentVector(; θP_true, + θMs=CA.ComponentMatrix( + zeros(n_θM, n_site), first(CA.getaxes(θMs_true)), CA.Axis(i=1:n_sample_pred)))) + int_μP_ϕg_unc=ComponentArrayInterpreter(ϕ_true) + interpreters = (; PMs = intm_PMs_gen, μP_ϕg_unc = int_μP_ϕg_unc ) + ζs, _ = CP.generate_ζ(rng, g, f, ϕ, xM, interpreters; n_MC=n_sample_pred) + + end; + diff --git a/test/test_sample_zeta.jl b/test/test_sample_zeta.jl new file mode 100644 index 0000000..6447fcd --- /dev/null +++ b/test/test_sample_zeta.jl @@ -0,0 +1,99 @@ +#using LinearAlgebra, BlockDiagonals + +using Test +using Zygote +using HybridVariationalInference +using HybridVariationalInference: HybridVariationalInference as CP +using StableRNGs +using CUDA +using GPUArraysCore: GPUArraysCore +using Random +#using SimpleChains +using ComponentArrays: ComponentArrays as CA +using TransformVariables + +#CUDA.device!(4) +rng = StableRNG(111) + +const case = DoubleMM.DoubleMMCase() +#const MLengine = Val(nameof(SimpleChains)) +scenario = (:default,) + +(; n_covar, n_site, n_batch, n_θM, n_θP) = get_hybridcase_sizes(case; scenario) + +@testset "test_sample_zeta" begin + (; xM, θP_true, θMs_true, xP, y_global_true, y_true, y_global_o, y_o + ) = gen_hybridcase_synthetic(case, rng; scenario); + + # n_site = 2 + # n_θP, n_θM = length(θ_true.θP), length(θ_true.θM) + # σ_θM = θ_true.θM .* 0.1 # 10% around expected + # θMs_true = θ_true.θM .+ randn(n_θM, n_site) .* σ_θM + + # set to 0.02 rather than zero for debugging non-zero correlations + ρsP = zeros(sum(1:(n_θP-1))) .+ 0.02 + ρsM = zeros(sum(1:(n_θM-1))) .+ 0.02 + + ϕunc = CA.ComponentVector(; + logσ2_logP=fill(-10.0, n_θP), + coef_logσ2_logMs=reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), + ρsP, + ρsM) + + transPMs = as( + (P=as(Array, asℝ₊, n_θP), + Ms=as(Array, asℝ₊, n_θM, 
n_site))) + θ_true = θ = CA.ComponentVector(; + P=θP_true, + Ms=θMs_true) + transPMs = as(( + P=as(Array, asℝ₊, n_θP), + Ms=as(Array, asℝ₊, n_θM, n_site))) + ζ_true = inverse_ca(transPMs, θ_true) + ϕ_true = vcat(ζ_true, CA.ComponentVector(unc=ϕunc)) + ϕ_cpu = vcat(ζ_true .+ 0.01, CA.ComponentVector(unc=ϕunc)) + + interpreters = (; pmu=ComponentArrayInterpreter(ϕ_true)) #, M=int_θM, PMs=int_θPMs) + + n_MC = 3 + @testset "sample_ζ_norm0 cpu" begin + ϕ = CA.getdata(ϕ_cpu) + ϕc = interpreters.pmu(ϕ) + ζ_resid, logdetΣ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC) + @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC) + gr = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc)[1]), ϕc)[1] + @test length(gr) == length(ϕ) + end; + # + @testset "sample_ζ_norm0 gpu" begin + ϕ = CuArray(CA.getdata(ϕ_cpu)); + #tmp = ϕ[1:6] + #vec2uutri(tmp) + ϕc = interpreters.pmu(ϕ); + @test CA.getdata(ϕc) isa GPUArraysCore.AbstractGPUArray + #ζP, ζMs, ϕunc = ϕc.P, ϕc.Ms, ϕc.unc + #urand = CUDA.randn(length(ϕc.P) + length(ϕc.Ms), n_MC) |> gpu + #include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss + #ζ_resid, logdetΣ = sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC) + #Zygote.gradient(ϕc -> sum(sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)[1]), ϕc)[1]; + ζ_resid, logdetΣ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC) + @test ζ_resid isa GPUArraysCore.AbstractGPUArray + @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC) + gr = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)[1]), ϕc)[1]; + @test length(gr) == length(ϕ) + int_unc = ComponentArrayInterpreter(ϕc.unc) + gr2 = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(rng, CA.getdata(ϕc.P), CA.getdata(ϕc.Ms), CA.getdata(ϕc.unc), int_unc; n_MC)[1]), ϕc)[1]; + end; + + # @testset "generate_ζ" begin + # ϕ = CA.getdata(ϕ_cpu) + # n_sample_pred = 200 + # intm_PMs_gen = ComponentArrayInterpreter(CA.ComponentVector(; θP_true, + # θMs=CA.ComponentMatrix( + # zeros(n_θM, n_site), first(CA.getaxes(θMs_true)), CA.Axis(i=1:n_sample_pred)))) + # int_μP_ϕg_unc=ComponentArrayInterpreter(ϕ_true) + # interpreters = (; PMs = intm_PMs_gen, μP_ϕg_unc = int_μP_ϕg_unc ) + # ζs, _ = CP.generate_ζ(rng, g, f, ϕ, xM, interpreters; n_MC=n_sample_pred) + + # end; +end; From 4a1d5c2877f372128f55f9f77d412f3f38d19b90 Mon Sep 17 00:00:00 2001 From: Thomas Wutzler Date: Thu, 9 Jan 2025 15:18:06 +0100 Subject: [PATCH 2/6] implement ELBO cost function --- dev/Project.toml | 4 + dev/doubleMM.jl | 281 ++++++++---------- ext/HybridVariationalInferenceFluxExt.jl | 34 ++- src/DoubleMM/f_doubleMM.jl | 1 + src/GPUDataHandler.jl | 31 ++ src/HybridVariationalInference.jl | 9 +- src/elbo.jl | 75 ++++- src/logden_normal.jl | 62 ++++ ...variablesjl => util_transformvariables.jl} | 0 test/Project.toml | 2 + test/runtests.jl | 4 + test/test_Flux.jl | 28 +- test/test_elbo.jl | 147 +++++---- test/test_logden_normal.jl | 42 +++ 14 files changed, 508 insertions(+), 212 deletions(-) create mode 100644 src/GPUDataHandler.jl create mode 100644 src/logden_normal.jl rename src/{util._transformvariablesjl => util_transformvariables.jl} (100%) create mode 100644 test/test_logden_normal.jl diff --git a/dev/Project.toml b/dev/Project.toml index 04a4a3a..162ab31 100644 --- a/dev/Project.toml +++ b/dev/Project.toml @@ -3,9 +3,12 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" 
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" HybridVariationalInference = "a108c475-a4e2-4021-9a84-cfa7df242f64" MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" +Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" OptimizationOptimisers = "42dfb2eb-d2b4-4451-abcd-913932933ac1" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SimpleChains = "de6bee2f-e2f4-4ec7-b6ed-219cc6f6e9e5" @@ -15,3 +18,4 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" TransformVariables = "84d833dd-6860-57f9-a1a7-6da5db126cff" UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" diff --git a/dev/doubleMM.jl b/dev/doubleMM.jl index cb754a8..7f2a013 100644 --- a/dev/doubleMM.jl +++ b/dev/doubleMM.jl @@ -1,20 +1,24 @@ using Test using HybridVariationalInference +using HybridVariationalInference: HybridVariationalInference as HVI using StableRNGs using Random using Statistics using ComponentArrays: ComponentArrays as CA using SimpleChains +import Flux # to allow for FluxMLEngine and cpu() using MLUtils import Zygote +using CUDA +using TransformVariables using OptimizationOptimisers - using UnicodePlots const case = DoubleMM.DoubleMMCase() const MLengine = Val(nameof(SimpleChains)) +const FluxMLengine = Val(nameof(Flux)) scenario = (:default,) rng = StableRNG(111) @@ -22,26 +26,8 @@ par_templates = get_hybridcase_par_templates(case; scenario) (; n_covar, n_site, n_batch, n_θM, n_θP) = get_hybridcase_sizes(case; scenario) -# const int_θP = ComponentArrayInterpreter(par_templates.θP) -# const int_θM = ComponentArrayInterpreter(par_templates.θM) -# const int_θPMs_flat = ComponentArrayInterpreter(P = n_θP, Ms = n_θM * n_batch) -# const int_θ = ComponentArrayInterpreter(CA.ComponentVector(;θP=par_templates.θP,θM=par_templates.θM)) -# # moved to f_doubleMM -# # const int_θdoubleMM = ComponentArrayInterpreter(flatten1(CA.ComponentVector(;θP,θM))) -# # const S1 = [1.0, 1.0, 1.0, 0.3, 0.1] -# # const S2 = [1.0, 3.0, 5.0, 5.0, 5.0] -# θ = CA.getdata(vcat(par_templates.θP, par_templates.θM)) - -# const int_θPMs = ComponentArrayInterpreter(CA.ComponentVector(;par_templates.θP, -# θMs=CA.ComponentMatrix(zeros(n_θM, n_batch), first(CA.getaxes(par_templates.θM)), CA.Axis(i=1:n_batch)))) - -(; xM, θP_true, θMs_true, xP, y_global_true, y_true, y_global_o, y_o) = gen_hybridcase_synthetic( - case, rng; scenario); - -@test isapprox( - vec(mean(CA.getdata(θMs_true); dims = 2)), CA.getdata(par_templates.θM), rtol = 0.02) -@test isapprox(vec(std(CA.getdata(θMs_true); dims = 2)), - CA.getdata(par_templates.θM) .* 0.1, rtol = 0.02) +(; xM, θP_true, θMs_true, xP, y_global_true, y_true, y_global_o, y_o, σ_o +) = gen_hybridcase_synthetic(case, rng; scenario); #----- fit g to θMs_true g, ϕg0 = gen_hybridcase_MLapplicator(case, MLengine; scenario); @@ -58,21 +44,22 @@ Zygote.gradient(x -> loss_g(x, xM, g)[1], ϕg0); optf = Optimization.OptimizationFunction((ϕg, p) -> loss_g(ϕg, xM, g)[1], Optimization.AutoZygote()) optprob = Optimization.OptimizationProblem(optf, ϕg0); -res = Optimization.solve(optprob, Adam(0.02), callback = callback_loss(100), maxiters = 600); +res = Optimization.solve(optprob, Adam(0.02), callback = callback_loss(100), maxiters = 800); ϕg_opt1 = res.u; loss_g(ϕg_opt1, xM, g) scatterplot(vec(θMs_true), vec(loss_g(ϕg_opt1, xM, g)[2])) @test cor(vec(θMs_true), vec(loss_g(ϕg_opt1, xM, g)[2])) > 0.9 -tmpf = () -> begin +f = 
gen_hybridcase_PBmodel(case; scenario) + +() -> begin #----------- fit g and θP to y_o # end2end inversion - f = gen_hybridcase_PBmodel(case; scenario) int_ϕθP = ComponentArrayInterpreter(CA.ComponentVector( ϕg = 1:length(ϕg0), θP = par_templates.θP)) - p = p0 = vcat(ϕg0, par_templates.θP .* 0.9); # slightly disturb θP_true + p = p0 = vcat(ϕg0, par_templates.θP .* 0.9) # slightly disturb θP_true # Pass the site-data for the batches as separate vectors wrapped in a tuple train_loader = MLUtils.DataLoader((xM, xP, y_o), batchsize = n_batch) @@ -85,7 +72,7 @@ tmpf = () -> begin optprob = OptimizationProblem(optf, p0, train_loader) res = Optimization.solve( - optprob, Adam(0.02), callback = callback_loss(100), maxiters = 1000); + optprob, Adam(0.02), callback = callback_loss(100), maxiters = 1000) l1, y_pred_global, y_pred, θMs = loss_gf(res.u, train_loader.data...) scatterplot(vec(θMs_true), vec(θMs)) @@ -94,108 +81,105 @@ tmpf = () -> begin hcat(par_templates.θP, int_ϕθP(res.u).θP) end -#---------- HADVI +#---------- HVI # TODO think about good general initializations coef_logσ2_logMs = [-5.769 -3.501; -0.01791 0.007951] -logσ2_logP = CA.ComponentVector(r0=-8.997, K2=-5.893) +logσ2_logP = CA.ComponentVector(r0 = -8.997, K2 = -5.893) mean_σ_o_MC = 0.006042 # correlation matrices -ρsP = zeros(sum(1:(n_θP-1))) -ρsM = zeros(sum(1:(n_θM-1))) +ρsP = zeros(sum(1:(n_θP - 1))) +ρsM = zeros(sum(1:(n_θM - 1))) ϕunc = CA.ComponentVector(; - logσ2_logP=logσ2_logP, - coef_logσ2_logMs=coef_logσ2_logMs, + logσ2_logP = logσ2_logP, + coef_logσ2_logMs = coef_logσ2_logMs, ρsP, ρsM) int_unc = ComponentArrayInterpreter(ϕunc) # for a conservative uncertainty assume σ2=1e-10 and no relationship with magnitude ϕunc0 = CA.ComponentVector(; - logσ2_logP=fill(-10.0, n_θP), - coef_logσ2_logMs=reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), + logσ2_logP = fill(-10.0, n_θP), + coef_logσ2_logMs = reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), ρsP, ρsM) -logσ2y = fill(2 * log(σ_o), size(y_o, 1)) +logσ2y = 2 .* log.(σ_o) n_MC = 3 - -#-------------- ADVI with g inside cost function -using CUDA -using TransformVariables - transPMs_batch = as( - (P=as(Array, asℝ₊, n_θP), - Ms=as(Array, asℝ₊, n_θM, n_batch))) + (P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_batch))) transPMs_all = as( - (P=as(Array, asℝ₊, n_θP), - Ms=as(Array, asℝ₊, n_θM, n_site))) - -ϕ_true = θ = CA.ComponentVector(; - μP=θP_true, - ϕg=ϕg_opt, - unc=ϕunc); + (P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_site))) + +n_ϕg = length(ϕg_opt1) +ϕt_true = θ = CA.ComponentVector(; + μP = θP_true, + ϕg = ϕg_opt1, + unc = ϕunc); trans_gu = as( - (μP=as(Array, asℝ₊, n_θP), - ϕg=as(Array, n_ϕg), - unc=as(Array, length(ϕunc)))) + (μP = as(Array, asℝ₊, n_θP), + ϕg = as(Array, n_ϕg), + unc = as(Array, length(ϕunc)))) trans_g = as( - (μP=as(Array, asℝ₊, n_θP), - ϕg=as(Array, n_ϕg))) - -const int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP, - θMs=CA.ComponentMatrix( - zeros(n_θM, n_batch), first(CA.getaxes(θM)), CA.Axis(i=1:n_batch)))) - -interpreters = interpreters_g = map(get_concrete,(; - μP_ϕg_unc=ComponentArrayInterpreter(ϕ_true), - PMs=int_PMs_batch, - unc=ComponentArrayInterpreter(ϕunc) + (μP = as(Array, asℝ₊, n_θP), + ϕg = as(Array, n_ϕg))) + +#const +int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP = θP_true, + θMs = CA.ComponentMatrix( + zeros(n_θM, n_batch), first(CA.getaxes(θMs_true)), CA.Axis(i = 1:n_batch)))) + +interpreters = interpreters_g = map(get_concrete, + (; + μP_ϕg_unc = 
ComponentArrayInterpreter(ϕt_true), + PMs = int_PMs_batch, + unc = ComponentArrayInterpreter(ϕunc) )) -ϕg_true_vec = CA.ComponentVector( - TransformVariables.inverse(trans_gu, cv2NamedTuple(ϕ_true))) -ϕcg_true = interpreters.μP_ϕg_unc(ϕg_true_vec) -ϕ_ini = ζ = vcat(ϕcg_true[[:μP, :ϕg]] .* 1.2, ϕcg_true[[:unc]]); -ϕ_ini0 = ζ = vcat(ϕcg_true[:μP] .* 0.0, SimpleChains.init_params(g), ϕunc0); +ϕ_true = inverse_ca(trans_gu, ϕt_true) +ϕ_ini = ζ = vcat(ϕ_true[[:μP, :ϕg]] .* 1.2, ϕ_true[[:unc]]); # slight disturbance +ϕ_ini0 = ζ = vcat(ϕ_true[:μP] .* 0.0, ϕg0, ϕunc0); # scratch -neg_elbo_transnorm_gf(rng, g, f, ϕcg_true, y_o[:, 1:n_batch], x_o[:, 1:n_batch], - transPMs_batch, map(get_concrete, interpreters); - n_MC=8, logσ2y) -Zygote.gradient(ϕ -> neg_elbo_transnorm_gf( - rng, g, f, ϕ, y_o[:, 1:n_batch], x_o[:, 1:n_batch], - transPMs_batch, interpreters; n_MC=8, logσ2y), ϕcg_true) +# test cost function and gradient +() -> begin + neg_elbo_transnorm_gf(rng, g, f, ϕ_true, y_o[:, 1:n_batch], xM[:, 1:n_batch], + transPMs_batch, map(get_concrete, interpreters); + n_MC = 8, logσ2y) + Zygote.gradient( + ϕ -> neg_elbo_transnorm_gf( + rng, g, f, ϕ, y_o[:, 1:n_batch], xM[:, 1:n_batch], + transPMs_batch, interpreters; n_MC = 8, logσ2y), + CA.getdata(ϕ_true)) +end +# optimize using SimpleChains () -> begin - train_loader = MLUtils.DataLoader((x_o, y_o), batchsize = n_batch) + train_loader = MLUtils.DataLoader((xM, y_o), batchsize = n_batch) - optf = Optimization.OptimizationFunction((ζg, data) -> begin - x_o, y_o = data + optf = Optimization.OptimizationFunction( + (ζg, data) -> begin + xM, y_o = data neg_elbo_transnorm_gf( - rng, g, f, ζg, y_o, x_o, transPMs_batch, map(get_concrete, interpreters_g); n_MC=5, logσ2y) + rng, g, f, ζg, y_o, xM, transPMs_batch, + map(get_concrete, interpreters_g); n_MC = 5, logσ2y) end, Optimization.AutoZygote()) - optprob = Optimization.OptimizationProblem(optf, CA.getdata(ϕ_ini), train_loader); - res = Optimization.solve(optprob, Optimisers.Adam(0.02), callback=callback_loss(50), maxiters=800); + optprob = Optimization.OptimizationProblem(optf, CA.getdata(ϕ_ini), train_loader) + res = Optimization.solve( + optprob, Optimisers.Adam(0.02), callback = callback_loss(50), maxiters = 800) #optprob = Optimization.OptimizationProblem(optf, ϕ_ini0); #res = Optimization.solve(optprob, Adam(0.02), callback=callback_loss(50), maxiters=1_400); end - -#using Lux -ϕ = ϕcg_true |> gpu; -x_o_gpu = x_o |> gpu; -# y_o = y_o |> gpu -# logσ2y = logσ2y |> gpu -n_covar = size(x_o, 1) -g_flux = Flux.Chain( - # dense layer with bias that maps to 8 outputs and applies `tanh` activation - Flux.Dense(n_covar => n_covar * 4, tanh), - Flux.Dense(n_covar * 4 => n_covar * 4, logistic), - # dense layer without bias that maps to n outputs and `identity` activation - Flux.Dense(n_covar * 4 => n_θM, identity, bias=false), -) + +ϕ = ϕ_true |> Flux.gpu; +xM_gpu = xM |> Flux.gpu; +g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario); + +# otpimize using LUX () -> begin using Lux g_lux = Lux.Chain( @@ -203,59 +187,56 @@ g_flux = Flux.Chain( Lux.Dense(n_covar => n_covar * 4, tanh), Lux.Dense(n_covar * 4 => n_covar * 4, logistic), # dense layer without bias that maps to n outputs and `identity` activation - Lux.Dense(n_covar * 4 => n_θM, identity, use_bias=false), + Lux.Dense(n_covar * 4 => n_θM, identity, use_bias = false) ) ps, st = Lux.setup(Random.default_rng(), g_lux) ps_ca = CA.ComponentArray(ps) |> gpu st = st |> gpu g_luxs = StatefulLuxLayer{true}(g_lux, nothing, st) - 
g_luxs(x_o_gpu[:, 1:n_batch], ps_ca) + g_luxs(xM_gpu[:, 1:n_batch], ps_ca) ax_g = CA.getaxes(ps_ca) - g_luxs(x_o_gpu[:, 1:n_batch], CA.ComponentArray(ϕ.ϕg, ax_g)) + g_luxs(xM_gpu[:, 1:n_batch], CA.ComponentArray(ϕ.ϕg, ax_g)) interpreters = (interpreters..., ϕg = ComponentArrayInterpreter(ps_ca)) ϕg = CA.ComponentArray(ϕ.ϕg, ax_g) ϕgc = interpreters.ϕg(ϕ.ϕg) - g_gpu = g_luxs + g_flux = g_luxs end -g_gpu = g_flux - -#Zygote.gradient(ϕg -> sum(g_gpu(x_o_gpu[:, 1:n_batch],ϕg)), ϕgc) -# Zygote.gradient(ϕg -> sum(compute_g(g_gpu, x_o_gpu[:, 1:n_batch], ϕg, interpreters)), ϕ.ϕg) -# Zygote.gradient(ϕ -> sum(tmp_gen1(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), ϕ.ϕg) -# Zygote.gradient(ϕ -> sum(tmp_gen2(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), CA.getdata(ϕ)) -# Zygote.gradient(ϕ -> sum(tmp_gen2(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), ϕ) |> cpu -# Zygote.gradient(ϕ -> sum(tmp_gen3(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)), ϕ) |> cpu -# Zygote.gradient(ϕ -> sum(tmp_gen4(g_gpu, x_o_gpu[:, 1:n_batch], ϕ, interpreters)[1]), ϕ) |> cpu -# generate_ζ(rng, g_gpu, f, ϕ, x_o_gpu[:, 1:n_batch], interpreters) -# Zygote.gradient(ϕ -> sum(generate_ζ(rng, g_gpu, f, ϕ, x_o_gpu[:, 1:n_batch], interpreters)[1]), ϕ) |> cpu -# include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss -# neg_elbo_transnorm_gf(rng, g_gpu, f, ϕ, y_o[:, 1:n_batch], -# x_o_gpu[:, 1:n_batch], transPMs_batch, interpreters; logσ2y) -# Zygote.gradient(ϕ -> sum(neg_elbo_transnorm_gf(rng, g_gpu, f, ϕ, y_o[:, 1:n_batch], -# x_o_gpu[:, 1:n_batch], transPMs_batch, interpreters; logσ2y)[1]), ϕ) |> cpu - - -fcost(ϕ) = neg_elbo_transnorm_gf(rng, g_gpu, f, ϕ, y_o[:, 1:n_batch], - x_o_gpu[:, 1:n_batch], transPMs_batch, map(get_concrete, interpreters); - n_MC=8, logσ2y = logσ2y) -fcost(ϕ) -gr = Zygote.gradient(fcost, ϕ) |> cpu; -Zygote.gradient(fcost, CA.getdata(ϕ)) +function fcost(ϕ) + neg_elbo_transnorm_gf(rng, g_flux, f, CA.getdata(ϕ), y_o[:, 1:n_batch], + xM_gpu[:, 1:n_batch], transPMs_batch, map(get_concrete, interpreters); + n_MC = 8, logσ2y = logσ2y) +end +fcost(ϕ) +Zygote.gradient(fcost, ϕ) |> cpu; +gr = Zygote.gradient(fcost, CA.getdata(ϕ)); +gr_c = CA.ComponentArray(gr[1], CA.getaxes(ϕ)...) 
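+# quick diagnostic sketch (not part of the fit): check which parameter block
+# dominates the ELBO gradient; assumes `gr_c` from above, possibly on the GPU
+map(k -> k => maximum(abs, Flux.cpu(CA.getdata(gr_c[k]))), (:μP, :ϕg, :unc))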
-train_loader = MLUtils.DataLoader((x_o_gpu, y_o), batchsize = n_batch) +train_loader = MLUtils.DataLoader((xM_gpu, y_o), batchsize = n_batch) -optf = Optimization.OptimizationFunction((ζg, data) -> begin - x_o, y_o = data +optf = Optimization.OptimizationFunction( + (ζg, data) -> begin + xM, y_o = data neg_elbo_transnorm_gf( - rng, g_gpu, f, ζg, y_o, x_o, transPMs_batch, map(get_concrete, interpreters_g); n_MC=5, logσ2y) + rng, g_flux, f, ζg, y_o, xM, transPMs_batch, + map(get_concrete, interpreters_g); n_MC = 5, logσ2y) end, Optimization.AutoZygote()) -optprob = Optimization.OptimizationProblem(optf, CA.getdata(ϕ_ini) |> gpu, train_loader); -res = res_gpu = Optimization.solve(optprob, Optimisers.Adam(0.02), callback=callback_loss(50), maxiters=800); +optprob = Optimization.OptimizationProblem( + optf, CA.getdata(ϕ_ini) |> Flux.gpu, train_loader); +res = res_gpu = Optimization.solve( + optprob, Optimisers.Adam(0.02), callback = callback_loss(50), maxiters = 800); -ζ_VIc = interpreters_g.μP_ϕg_unc(res.u |> cpu) -ζMs_VI = g(x_o, ζ_VIc.ϕg) +# start from zero +() -> begin + optprob = Optimization.OptimizationProblem( + optf, CA.getdata(ϕ_ini0) |> Flux.gpu, train_loader); + res = res_gpu = Optimization.solve( + optprob, Optimisers.Adam(0.02), callback = callback_loss(50), maxiters = 4_000); +end + +ζ_VIc = interpreters_g.μP_ϕg_unc(res.u |> Flux.cpu) +ζMs_VI = g(xM, ζ_VIc.ϕg) ϕunc_VI = int_unc(ζ_VIc.unc) hcat(θP_true, exp.(ζ_VIc.μP)) @@ -266,31 +247,35 @@ hcat(ϕunc, ϕunc_VI) # need to compare to MC sample # hard to estimate for original very small theta's but otherwise good # test predicting correct obs-uncertainty of predictive posterior +# TODO reuse g_flux rather than g n_sample_pred = 200 -intm_PMs_gen = ComponentArrayInterpreter(CA.ComponentVector(; θP, - θMs=CA.ComponentMatrix( - zeros(n_θM, n_site), first(CA.getaxes(θM)), CA.Axis(i=1:n_sample_pred)))) +intm_PMs_gen = ComponentArrayInterpreter(CA.ComponentVector(; θP = θP_true, + θMs = CA.ComponentMatrix( + zeros(n_θM, n_site), first(CA.getaxes(θMs_true)), CA.Axis(i = 1:n_sample_pred)))) -include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss -ζs, _ = generate_ζ(rng, g, f, res.u |> cpu, x_o, - (;interpreters..., PMs = intm_PMs_gen); n_MC=n_sample_pred) +ζs, _ = HVI.generate_ζ(rng, g, f, res.u |> Flux.cpu, xM, + (; interpreters..., PMs = intm_PMs_gen); n_MC = n_sample_pred) # ζ = ζs[:,1] -θsc = stack(ζ -> CA.getdata(CA.ComponentVector( - TransformVariables.transform(transPMs_all, ζ))), eachcol(ζs)); -y_pred = stack(map(ζ -> first(predict_y(ζ, f, transPMs_all)), eachcol(ζs))); +θsc = stack( + ζ -> CA.getdata(CA.ComponentVector( + TransformVariables.transform(transPMs_all, ζ))), + eachcol(ζs)); +y_pred = stack(map(ζ -> first(HVI.predict_y(ζ, f, transPMs_all)), eachcol(ζs))); size(y_pred) -σ_o_post = mapslices(std, y_pred; dims=3); +σ_o_post = mapslices(std, y_pred; dims = 3)[:, :, 1]; #describe(σ_o_post) -vcat(σ_o, mean_σ_o_MC, mean(σ_o_post), sqrt(mean(abs2, σ_o_post))) -mean_y_pred = map(mean, eachslice(y_pred; dims=(1, 2))) +hcat(σ_o, fill(mean_σ_o_MC, length(σ_o)), + mean(σ_o_post, dims = 2), sqrt.(mean(abs2, σ_o_post, dims = 2))) +mean_y_pred = map(mean, eachslice(y_pred; dims = (1, 2))) #describe(mean_y_pred - y_o) histogram(vec(mean_y_pred - y_true)) # predictions centered around y_o (or y_true) # look at θP, θM1 of first site -intm = ComponentArrayInterpreter(int_θdoubleMM(1:length(int_θdoubleMM)), (n_sample_pred,)) -ζs1c = intm(ζs[1:length(int_θdoubleMM), :]) -vcat(θP_true, θM_true) +θPM = vcat(θP_true, θMs_true[:, 1]) 
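+# reference values: the true global parameters together with θM of the first site,
+# used below to compare the marginal posterior samples against the truth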
+intm = ComponentArrayInterpreter(θPM, (n_sample_pred,))
+ζs1c = intm(ζs[1:length(θPM), :])
+θPM
 histogram(exp.(ζs1c[:r0, :]))
 histogram(exp.(ζs1c[:K2, :]))
 histogram(exp.(ζs1c[:r1, :]))
@@ -301,9 +286,3 @@ scatterplot(ζs1c[:r0, :], ζs1c[:K2, :]) # r0 and K also correlated (from θP)
 scatterplot(ζs1c[:r0, :], ζs1c[:K1, :]) # no correlation (modeled independent)
 
 # TODO compare distributions to MC sample
-
-
-
-
-
-
diff --git a/ext/HybridVariationalInferenceFluxExt.jl b/ext/HybridVariationalInferenceFluxExt.jl
index 901a1a9..61b7095 100644
--- a/ext/HybridVariationalInferenceFluxExt.jl
+++ b/ext/HybridVariationalInferenceFluxExt.jl
@@ -1,20 +1,46 @@
-module HybridVariationalInferenceFluxExt 
+module HybridVariationalInferenceFluxExt
 using HybridVariationalInference, Flux
 using HybridVariationalInference: HybridVariationalInference as HVI
 
-struct FluxApplicator{RT} <: AbstractModelApplicator 
+struct FluxApplicator{RT} <: AbstractModelApplicator
     rebuild::RT
 end
 
-function HVI.construct_FluxApplicator(m::Chain) 
+function HVI.construct_FluxApplicator(m::Chain)
     _, rebuild = destructure(m)
     FluxApplicator(rebuild)
 end
 
-function HVI.apply_model(app::FluxApplicator, x, ϕ) 
+function HVI.apply_model(app::FluxApplicator, x, ϕ)
     m = app.rebuild(ϕ)
     m(x)
 end
 
+struct FluxGPUDataHandler <: AbstractGPUDataHandler end
+HVI.handle_GPU_data(::FluxGPUDataHandler, x::AbstractArray) = cpu(x)
+
+function __init__()
+    #@info "HybridVariationalInference: setting FluxGPUDataHandler"
+    HVI.set_default_GPUHandler(FluxGPUDataHandler())
+end
+
+function HVI.gen_hybridcase_MLapplicator(case::HVI.DoubleMM.DoubleMMCase, ::Val{:Flux};
+        scenario::NTuple = ())
+    (; n_covar, n_θM) = get_hybridcase_sizes(case; scenario)
+    FloatType = get_hybridcase_FloatType(case; scenario)
+    n_out = n_θM
+    is_using_dropout = :use_dropout ∈ scenario
+    is_using_dropout && error("dropout scenario not supported with Flux yet.")
+    g_chain = Flux.Chain(
+        # dense layer with bias that maps to n_covar * 4 outputs and applies `tanh` activation
+        Flux.Dense(n_covar => n_covar * 4, tanh),
+        Flux.Dense(n_covar * 4 => n_covar * 4, tanh),
+        # dense layer without bias that maps to n_out outputs and `identity` activation
+        Flux.Dense(n_covar * 4 => n_out, identity, bias = false)
+    )
+    ϕ, _ = destructure(g_chain)
+    construct_FluxApplicator(g_chain), ϕ
+end
+
 end # module
diff --git a/src/DoubleMM/f_doubleMM.jl b/src/DoubleMM/f_doubleMM.jl
index 31fb2e1..4c92fdd 100644
--- a/src/DoubleMM/f_doubleMM.jl
+++ b/src/DoubleMM/f_doubleMM.jl
@@ -69,5 +69,6 @@ function HybridVariationalInference.gen_hybridcase_synthetic(case::DoubleMMCase,
         y_true,
         y_global_o,
         y_o,
+        σ_o = fill(σ_o, size(y_true,1)),
     )
 end
diff --git a/src/GPUDataHandler.jl b/src/GPUDataHandler.jl
new file mode 100644
index 0000000..64d67e5
--- /dev/null
+++ b/src/GPUDataHandler.jl
@@ -0,0 +1,31 @@
+abstract type AbstractGPUDataHandler end
+
+"""
+    (app::AbstractGPUDataHandler)(x) = handle_GPU_data(app, x)
+
+Callable applied to argument `x`, used to configure the exchange of data between
+GPU and CPU.
+By default, nothing is done to `x`.
+Package extensions for Flux and Lux implement overloads for `AbstractGPUArray`
+that call `cpu(x)` to transfer data from the GPU to the CPU. Those package
+extensions also call `set_default_GPUHandler()` from their `__init__`, so that
+loading the extension makes its handler the default.
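+
+A minimal usage sketch (which handler is returned depends on the loaded extensions):
+
+```julia
+x = rand(Float32, 3)          # a CPU array
+h = get_default_GPUHandler()  # NullGPUDataHandler, or e.g. FluxGPUDataHandler once Flux is loaded
+h(x)                          # identity for CPU arrays; GPU arrays are moved via `cpu(x)`
+```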
+""" +function handle_GPU_data(::AbstractGPUDataHandler, x::AbstractArray) + x +end + +(app::AbstractGPUDataHandler)(x) = handle_GPU_data(app, x) + + +struct NullGPUDataHandler <: AbstractGPUDataHandler end + +handle_GPU_data(::NullGPUDataHandler, x::AbstractArray) = x + +default_GPU_DataHandler = NullGPUDataHandler() +get_default_GPUHandler() = default_GPU_DataHandler +function set_default_GPUHandler(handler::AbstractGPUDataHandler) + @info "set_default_GPUHandler: setting default handler to $handler" + global default_GPU_DataHandler = handler +end + + diff --git a/src/HybridVariationalInference.jl b/src/HybridVariationalInference.jl index 7dee309..1c619e4 100644 --- a/src/HybridVariationalInference.jl +++ b/src/HybridVariationalInference.jl @@ -13,7 +13,7 @@ using Zygote # Zygote.@ignore CUDA.randn using BlockDiagonals export inverse_ca -include("util._transformvariablesjl") +include("util_transformvariables.jl") export ComponentArrayInterpreter, flatten1, get_concrete include("ComponentArrayInterpreter.jl") @@ -22,6 +22,9 @@ export AbstractModelApplicator, construct_SimpleChainsApplicator, construct_Flux construct_LuxApplicator include("ModelApplicator.jl") +export AbstractGPUDataHandler, NullGPUDataHandler, get_default_GPUHandler +include("GPUDataHandler.jl") + export AbstractHybridCase, gen_hybridcase_MLapplicator, gen_hybridcase_PBmodel, get_hybridcase_sizes, get_hybridcase_FloatType, gen_hybridcase_synthetic, get_hybridcase_par_templates, gen_cov_pred include("hybrid_case.jl") @@ -35,9 +38,13 @@ include("gencovar.jl") export callback_loss include("util_opt.jl") +export neg_logden_indep_normal, entropy_MvNormal +include("logden_normal.jl") + #export - all internal include("cholesky.jl") +export neg_elbo_transnorm_gf include("elbo.jl") export DoubleMM diff --git a/src/elbo.jl b/src/elbo.jl index db0cd97..eb6d4db 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -1,3 +1,55 @@ +""" +Cost function (ELBO) for hybrid model with batched sites. + +It generates n_MC samples for each site, and uses these to compute the +expected value of the likelihood of observations. + +## Arguments +- rng: random number generator (ignored on CUDA, if ϕ is a AbstactGPUArray) +- g: machine learnig model +- f: mechanistic model +- ϕ: flat vector of parameters + including parameter of f (ϕ_P), of g (ϕ_Ms), and of VI (ϕ_unc), + interpreted by interpreters.μP_ϕg_unc and interpreters.PMs +- y_ob: matrix of observations (n_obs x n_site_batch) +- x: matrix of covariates (n_cov x n_site_batch) +- transPMs: Transformations with componets P, Ms, similar to interpreters +- n_MC: number of MonteCarlo samples from the distribution of parameters to simulate + using the mechanistic model f. 
+- logσ2y: observation uncertainty (log of the variance)
+"""
+function neg_elbo_transnorm_gf(rng, g, f, ϕ::AbstractVector, y_ob, x::AbstractMatrix,
+    transPMs, interpreters::NamedTuple;
+    n_MC=3, logσ2y, gpu_data_handler = get_default_GPUHandler())
+    ζ, logdetΣ = generate_ζ(rng, g, f, ϕ, x, interpreters; n_MC)
+    ζ_cpu = gpu_data_handler(ζ) # differentiable fetch to CPU in Flux package extension
+    #ζi = first(eachcol(ζ_cpu))
+    nLy = reduce(+, map(eachcol(ζ_cpu)) do ζi
+        y_pred_i, logjac = predict_y(ζi, f, transPMs)
+        nLy1 = neg_logden_indep_normal(y_ob, y_pred_i, logσ2y)
+        nLy1 - logjac
+    end) / n_MC
+    ent = entropy_MvNormal(size(ζ, 1), logdetΣ) # defined in logden_normal
+    nLy - ent
+end
+
+# predict observations for n_MC parameter samples:
+# no observations are involved here; each sample is transformed and run through f
+function predict_gf(rng, g, f, ϕ::AbstractVector, x::AbstractMatrix,
+    transPMs, interpreters::NamedTuple;
+    n_MC=3, gpu_data_handler = get_default_GPUHandler())
+    ζ, _logdetΣ = generate_ζ(rng, g, f, ϕ, x, interpreters; n_MC)
+    ζ_cpu = gpu_data_handler(ζ) # differentiable fetch to CPU in Flux package extension
+    y_pred = stack(map(eachcol(ζ_cpu)) do ζi
+        first(predict_y(ζi, f, transPMs))
+    end)
+    y_pred
+end
+
+
+
 """
 Generate samples of (inv-transformed) model parameters, ζ, and Log-Determinant
 of their distribution.
@@ -93,4 +145,25 @@ function _create_random(rng, ::GPUArraysCore.AbstractGPUVector{T}, dims...) where {T}
     # ignores rng
     # https://discourse.julialang.org/t/help-using-cuda-zygote-and-random-numbers/123458/4?u=bgctw
     # Zygote.@ignore CUDA.randn(rng, dims...)
     Zygote.@ignore CUDA.randn(dims...)
-end \ No newline at end of file
+end
+
+"""
+Compute predictions and the log-determinant of the transformation at the given
+transformed parameters for each site.
+
+The number of sites is given by the number of columns in `Ms`, which is determined
+by the transformation, `transPMs`.
+
+Steps:
+- transform the parameters to the original constrained space
+- apply the mechanistic model for each site
+"""
+function predict_y(ζi, f, transPMs)
+    θtup, logjac = transform_and_logjac(transPMs, ζi) # both allocating
+    θc = CA.ComponentVector(θtup)
+    # TODO provide xP
+    xP = fill((), size(θc.Ms,2))
+    y_pred_global, y_pred = f(θc.P, θc.Ms, xP) # TODO parallelize on CPU
+    # TODO take care of y_pred_global
+    y_pred, logjac
+end
diff --git a/src/logden_normal.jl b/src/logden_normal.jl
new file mode 100644
index 0000000..b4d4dfe
--- /dev/null
+++ b/src/logden_normal.jl
@@ -0,0 +1,62 @@
+"""
+    neg_logden_indep_normal(obs, μ, logσ2s; σfac=1.0)
+
+Compute the negative log-density of observations `obs` under multiple independent
+normal distributions, given estimated means `μ` and estimated logs of the
+variance parameters, `logσ2s`.
+
+All arguments should be vectors of the same length.
+If `obs` and `μ` are given as matrices of several column-vectors, their summed
+log-density is computed, assuming each column has the same `logσ2s`.
+
+Keyword argument `σfac` can be increased to put more weight on achieving
+a low uncertainty estimate and means closer to the observations, which helps
+an initial fit. The obtained parameters can then be used as starting values
+for the proper fit with `σfac=1.0`.
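+
+A minimal numeric sketch (arbitrary values, not taken from the package tests):
+
+```julia
+obs = [1.0, 2.0]; μ = [1.1, 1.9]; logσ2 = log.([0.01, 0.01])
+neg_logden_indep_normal(obs, μ, logσ2)             # standard weighting
+neg_logden_indep_normal(obs, μ, logσ2; σfac = 2.0) # weights the log-variance term twice
+```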
+""" +function neg_logden_indep_normal(obs::AbstractVector, μ::AbstractVector, logσ2::AbstractVector; σfac=1.0) + # log of independent Normal distributions + # estimate independent uncertainty of each θM, rather than full covariance + #nlogL = sum(σfac .* log.(σs) .+ 1 / 2 .* abs2.((obs .- μ) ./ σs)) + # problems with division by zero, better constrain log(σs) see Kendall17 + # s = log.(σs) + # nlogL = sum(σfac .* s .+ 1 / 2 .* abs2.((obs .- μ) ./ exp.(s))) + # optimize argument logσ2 rather than σs for performance + #nlogL = sum(σfac .* (1/2) .* logσ2 .+ (1/2) .* exp.(.- logσ2) .* abs2.(obs .- μ)) + # specifying logσ2 instead of σ is not transforming a random variable -> no Jacobian + nlogL = sum(σfac .* logσ2 .+ abs2.(obs .- μ) .* exp.(.-logσ2)) / 2 + return (nlogL) +end +function neg_logden_indep_normal(obss::AbstractMatrix, preds::AbstractMatrix, logσ2::AbstractVector; kwargs...) + nlogLs = map(eachcol(obss), eachcol(preds)) do obs, μ + neg_logden_indep_normal(obs, μ, logσ2; kwargs...) + end + nlogL = sum(nlogLs) + return nlogL +end + +function neg_logden_indep_normal(obss::AbstractMatrix, preds::AbstractMatrix, logσ2s::AbstractMatrix; kwargs...) + nlogLs = map(eachcol(obss), eachcol(preds), eachcol(logσ2s)) do obs, μ, logσ2 + neg_logden_indep_normal(obs, μ, logσ2; kwargs...) + end + nlogL = sum(nlogLs) + return nlogL +end + +entropy_MvNormal(K, logdetΣ) = (K*(1+log(2π)) + logdetΣ)/2 +entropy_MvNormal(Σ) = entropy_MvNormal(size(Σ,1), logdet(Σ)) + +# struct MvNormalLogDensityProblem{T} +# y::AbstractVector{T} +# σfac::T +# end + +# function MvNormalLogDensityProblem(y::AbstractVector{T}; σfac=1) where {T} +# MvNormalLogDensityProblem(y, T(σfac)) +# end + +# # define a callable that unpacks parameters, and evaluates the log likelihood +# function (problem::MvNormalLogDensityProblem)(θ) +# μ, logσ2 = θ +# -sum(problem.σfac .* logσ2 .+ abs2.(problem.y .- μ) .* exp.(.-logσ2)) / 2 +# end + diff --git a/src/util._transformvariablesjl b/src/util_transformvariables.jl similarity index 100% rename from src/util._transformvariablesjl rename to src/util_transformvariables.jl diff --git a/test/Project.toml b/test/Project.toml index 97ffeaf..c4d7183 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -2,6 +2,7 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -17,3 +18,4 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TransformVariables = "84d833dd-6860-57f9-a1a7-6da5db126cff" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" diff --git a/test/runtests.jl b/test/runtests.jl index 18434b3..50635e6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -9,12 +9,16 @@ const GROUP = get(ENV, "GROUP", "All") # defined in in CI.yml @time @safetestset "test_gencovar" include("test_gencovar.jl") #@safetestset "test" include("test/test_SimpleChains.jl") @time @safetestset "test_SimpleChains" include("test_SimpleChains.jl") + #@safetestset "test" include("test/test_logden_normal.jl") + @time @safetestset "test_logden_normal" include("test_logden_normal.jl") #@safetestset "test" include("test/test_doubleMM.jl") @time @safetestset "test_doubleMM" include("test_doubleMM.jl") #@safetestset "test" 
include("test/test_cholesky_structure.jl") @time @safetestset "test_cholesky_structure" include("test_cholesky_structure.jl") #@safetestset "test" include("test/test_sample_zeta.jl") @time @safetestset "test_sample_zeta" include("test_sample_zeta.jl") + #@safetestset "test" include("test/test_elbo.jl") + @time @safetestset "test_elbo" include("test_elbo.jl") # #@safetestset "test" include("test/test_Flux.jl") @time @safetestset "test_Flux" include("test_Flux.jl") diff --git a/test/test_Flux.jl b/test/test_Flux.jl index 54c3190..a3ba11b 100644 --- a/test/test_Flux.jl +++ b/test/test_Flux.jl @@ -1,16 +1,37 @@ -using HybridVariationalInference using Test -using Flux using StatsFuns: logistic using CUDA, GPUArraysCore +using HybridVariationalInference +# @testset "get_default_GPUHandler before loading Flux" begin +# # during overall package testing Flux could be loaded beforehand +# h = get_default_GPUHandler() +# @test h isa NullGPUDataHandler +# x = CuArray(1:5) +# xh = h(x) +# @test xh === x +# end; + +using Flux +@testset "get_default_GPUHandler after loading Flux" begin + # difficult to access type in ext + # HybridVariationalInferenceFluxExt.FluxGPUDataHandler + #typeof(HybridVariationalInference.default_GPU_DataHandler) + h = get_default_GPUHandler() + @test !(h isa NullGPUDataHandler) + x = CuArray(1:5) + xh = h(x) + @test xh isa Vector +end; + + @testset "FluxModelApplicator" begin n_covar = 5 n_out = 2 g_chain = Chain( Dense(n_covar => n_covar * 4, tanh), Dense(n_covar * 4 => n_covar * 4, tanh), - Dense(n_covar * 4 => n_out, logistic, bias=false), + Dense(n_covar * 4 => n_out, identity, bias=false), ); g = construct_FluxApplicator(g_chain) n_site = 3 @@ -26,3 +47,4 @@ using CUDA, GPUArraysCore #@test ϕ isa GPUArraysCore.AbstractGPUArray @test size(y) == (n_out, n_site) end; + diff --git a/test/test_elbo.jl b/test/test_elbo.jl index 62064f7..735a960 100644 --- a/test/test_elbo.jl +++ b/test/test_elbo.jl @@ -1,16 +1,16 @@ #using LinearAlgebra, BlockDiagonals using Test -using Zygote using HybridVariationalInference using HybridVariationalInference: HybridVariationalInference as CP using StableRNGs -using CUDA -using GPUArraysCore: GPUArraysCore using Random using SimpleChains using ComponentArrays: ComponentArrays as CA using TransformVariables +using Zygote +using CUDA +using GPUArraysCore: GPUArraysCore #CUDA.device!(4) rng = StableRNG(111) @@ -25,61 +25,62 @@ f = gen_hybridcase_PBmodel(case; scenario) (; n_covar, n_site, n_batch, n_θM, n_θP) = get_hybridcase_sizes(case; scenario) -(; xM, θP_true, θMs_true, xP, y_global_true, y_true, y_global_o, y_o +(; xM, θP_true, θMs_true, xP, y_global_true, y_true, y_global_o, y_o, σ_o ) = gen_hybridcase_synthetic(case, rng; scenario); # correlation matrices -ρsP = zeros(sum(1:(n_θP-1))) -ρsM = zeros(sum(1:(n_θM-1))) +ρsP = zeros(sum(1:(n_θP - 1))) +ρsM = zeros(sum(1:(n_θM - 1))) () -> begin coef_logσ2_logMs = [-5.769 -3.501; -0.01791 0.007951] - logσ2_logP = CA.ComponentVector(r0=-8.997, K2=-5.893) + logσ2_logP = CA.ComponentVector(r0 = -8.997, K2 = -5.893) #mean_σ_o_MC = 0.006042 ϕunc = CA.ComponentVector(; - logσ2_logP=logσ2_logP, - coef_logσ2_logMs=coef_logσ2_logMs, + logσ2_logP = logσ2_logP, + coef_logσ2_logMs = coef_logσ2_logMs, ρsP, ρsM) end - # for a conservative uncertainty assume σ2=1e-10 and no relationship with magnitude +logσ2y = 2 .* log.(σ_o) ϕunc0 = CA.ComponentVector(; - logσ2_logP=fill(-10.0, n_θP), - coef_logσ2_logMs=reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), + logσ2_logP = fill(-10.0, n_θP), + coef_logσ2_logMs = 
reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), ρsP, ρsM) #int_unc = ComponentArrayInterpreter(ϕunc0) transPMs_batch = as( - (P=as(Array, asℝ₊, n_θP), - Ms=as(Array, asℝ₊, n_θM, n_batch))) + (P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_batch))) transPMs_all = as( - (P=as(Array, asℝ₊, n_θP), - Ms=as(Array, asℝ₊, n_θM, n_site))) - + (P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_site))) + ϕ_true = θ = CA.ComponentVector(; - μP=θP_true, - ϕg=ϕg0, #ϕg_opt, # here start from randomized - unc=ϕunc); + μP = θP_true, + ϕg = ϕg0, #ϕg_opt, # here start from randomized + unc = ϕunc0); trans_gu = as( - (μP=as(Array, asℝ₊, n_θP), - ϕg=as(Array, length(ϕg0)), - unc=as(Array, length(ϕunc)))) + (μP = as(Array, asℝ₊, n_θP), + ϕg = as(Array, length(ϕg0)), + unc = as(Array, length(ϕunc0)))) trans_g = as( - (μP=as(Array, asℝ₊, n_θP), - ϕg=as(Array, length(ϕg0)))) - -int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP_true, - θMs=CA.ComponentMatrix( - zeros(n_θM, n_batch), first(CA.getaxes(θMs_true)), CA.Axis(i=1:n_batch)))) - -interpreters = map(get_concrete,(; - μP_ϕg_unc=ComponentArrayInterpreter(ϕ_true), - PMs=int_PMs_batch, - unc=ComponentArrayInterpreter(ϕunc0) + (μP = as(Array, asℝ₊, n_θP), + ϕg = as(Array, length(ϕg0)))) + +int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP = θP_true, + θMs = CA.ComponentMatrix( + zeros(n_θM, n_batch), first(CA.getaxes(θMs_true)), CA.Axis(i = 1:n_batch)))) + +interpreters = map(get_concrete, + (; + μP_ϕg_unc = ComponentArrayInterpreter(ϕ_true), + PMs = int_PMs_batch, + unc = ComponentArrayInterpreter(ϕunc0) )) ϕg_true_vec = CA.ComponentVector( @@ -88,22 +89,64 @@ interpreters = map(get_concrete,(; ϕ_ini = ζ = vcat(ϕcg_true[[:μP, :ϕg]] .* 1.2, ϕcg_true[[:unc]]); ϕ_ini0 = ζ = vcat(ϕcg_true[:μP] .* 0.0, ϕg0, ϕunc0); -neg_elbo_transnorm_gf(rng, g, f, ϕcg_true, y_o[:, 1:n_batch], xM[:, 1:n_batch], - transPMs_batch, map(get_concrete, interpreters); - n_MC=8, logσ2y) -Zygote.gradient(ϕ -> neg_elbo_transnorm_gf( - rng, g, f, ϕ, y_o[:, 1:n_batch], x_o[:, 1:n_batch], - transPMs_batch, interpreters; n_MC=8, logσ2y), ϕcg_true) - - @testset "generate_ζ" begin - ϕ = CA.getdata(ϕ_cpu) - n_sample_pred = 200 - intm_PMs_gen = ComponentArrayInterpreter(CA.ComponentVector(; θP_true, - θMs=CA.ComponentMatrix( - zeros(n_θM, n_site), first(CA.getaxes(θMs_true)), CA.Axis(i=1:n_sample_pred)))) - int_μP_ϕg_unc=ComponentArrayInterpreter(ϕ_true) - interpreters = (; PMs = intm_PMs_gen, μP_ϕg_unc = int_μP_ϕg_unc ) - ζs, _ = CP.generate_ζ(rng, g, f, ϕ, xM, interpreters; n_MC=n_sample_pred) - - end; +@testset "generate_ζ" begin + ζ, logdetΣ = CP.generate_ζ( + rng, g, f, ϕcg_true, xM[:, 1:n_batch], map(get_concrete, interpreters); + n_MC = 8) + @test ζ isa Matrix + gr = Zygote.gradient( + ϕ -> sum(CP.generate_ζ( + rng, g, f, ϕ, xM[:, 1:n_batch], map(get_concrete, interpreters); + n_MC = 8)[1]), + CA.getdata(ϕcg_true)) + @test gr[1] isa Vector +end; + +# setup g as FluxNN on gpu +using Flux +FluxMLengine = Val(nameof(Flux)) +g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario) + +@testset "generate_ζ gpu" begin + ϕ = CuArray(CA.getdata(ϕcg_true)) + xMg_batch = CuArray(xM[:, 1:n_batch]) + ζ, logdetΣ = CP.generate_ζ( + rng, g_flux, f, ϕ, xMg_batch, map(get_concrete, interpreters); + n_MC = 8) + @test ζ isa CuMatrix + gr = Zygote.gradient( + ϕ -> sum(CP.generate_ζ( + rng, g_flux, f, ϕ, xMg_batch, map(get_concrete, interpreters); + n_MC = 8)[1]), + ϕ) + @test gr[1] isa CuVector +end; + +@testset "neg_elbo_transnorm_gf cpu" 
begin + cost = neg_elbo_transnorm_gf(rng, g, f, ϕcg_true, y_o[:, 1:n_batch], xM[:, 1:n_batch], + transPMs_batch, map(get_concrete, interpreters); + n_MC = 8, logσ2y) + @test cost isa Float64 + gr = Zygote.gradient( + ϕ -> neg_elbo_transnorm_gf( + rng, g, f, ϕ, y_o[:, 1:n_batch], xM[:, 1:n_batch], + transPMs_batch, interpreters; n_MC = 8, logσ2y), + CA.getdata(ϕcg_true)) + @test gr[1] isa Vector +end; + +@testset "neg_elbo_transnorm_gf gpu" begin + ϕ = CuArray(CA.getdata(ϕcg_true)) + xMg_batch = CuArray(xM[:, 1:n_batch]) + cost = neg_elbo_transnorm_gf(rng, g_flux, f, ϕ, y_o[:, 1:n_batch], xMg_batch, + transPMs_batch, map(get_concrete, interpreters); + n_MC = 8, logσ2y) + @test cost isa Float64 + gr = Zygote.gradient( + ϕ -> neg_elbo_transnorm_gf( + rng, g_flux, f, ϕ, y_o[:, 1:n_batch], xMg_batch, + transPMs_batch, interpreters; n_MC = 8, logσ2y), + ϕ) + @test gr[1] isa CuVector +end; diff --git a/test/test_logden_normal.jl b/test/test_logden_normal.jl new file mode 100644 index 0000000..91147c4 --- /dev/null +++ b/test/test_logden_normal.jl @@ -0,0 +1,42 @@ +using Test +using Distributions +using Test +using HybridVariationalInference +using HybridVariationalInference: HybridVariationalInference as CP +using LinearAlgebra + + +@testset "neg_logden_indep_normal vector" begin + loglik_norm(y, μ, σ) = -1 / 2 .* (Distributions.log2π .+ 2 .* log.(σ) .+ abs2.(y .- μ) ./ abs2.(σ)) + loglik_norm_l(y, μ, logσ) = -1 / 2 .* (Distributions.log2π .+ 2 .* logσ .+ abs2.(y .- μ) ./ abs2.(exp.(logσ))) + logden_norm_l(y, μ, logσ) = -1 / 2 .* (2 .* logσ .+ abs2.(y .- μ) ./ abs2.(exp.(logσ))) + neg_logden_norm_l2(y, μ, logσ2) = (logσ2 .+ abs2.(y .- μ) .* exp.(-logσ2)) ./ 2 + + # first test that neg_logden_norm_l2 retuns values of logpdf(Normal) up to an additive + μ = [1.0, 1.0] + σ = [1.1, 2.0] + logσ2 = log.(abs2.(σ)) + y = [1.2, 1.1] + tmp_true = logpdf.(Normal.(μ, σ), y) + dlogpdf = tmp_true .- tmp_true[1] + #loglik_norm(y, μ, σ) + #loglik_norm_l(y, μ, log.(σ)) + #tmp = logden_norm_l(y, μ, log.(σ)) + #tmp .- tmp[1] + tmp = neg_logden_norm_l2(y, μ, logσ2) + @test isapprox(tmp .- tmp[1], -dlogpdf) + + # next test that the sum of neg_logden_norm_l2 corresponds to + @test neg_logden_indep_normal(y, μ, logσ2) ≈ sum(tmp) +end; + +@testset "entropy_MvNormal" begin + S = Diagonal([4,5]) .+ rand(2,2) + S2 = Symmetric(S*S) + @test entropy_MvNormal(S2) == entropy(MvNormal(S2)) +end; + + + + + From 4d4da56c5c6f6ce64aca122828f802e2f8cfadfe Mon Sep 17 00:00:00 2001 From: Thomas Wutzler Date: Thu, 9 Jan 2025 19:21:24 +0100 Subject: [PATCH 3/6] encapsulate generating transformations and interpreters implement predict_gf function --- dev/doubleMM.jl | 179 +++++++++++++++--------------- src/HybridVariationalInference.jl | 5 +- src/elbo.jl | 45 ++++---- src/init_hybrid_params.jl | 69 ++++++++++++ test/test_cholesky_structure.jl | 4 +- test/test_elbo.jl | 148 ++++++++++++++---------- 6 files changed, 277 insertions(+), 173 deletions(-) create mode 100644 src/init_hybrid_params.jl diff --git a/dev/doubleMM.jl b/dev/doubleMM.jl index 7f2a013..a81d437 100644 --- a/dev/doubleMM.jl +++ b/dev/doubleMM.jl @@ -53,8 +53,8 @@ scatterplot(vec(θMs_true), vec(loss_g(ϕg_opt1, xM, g)[2])) f = gen_hybridcase_PBmodel(case; scenario) +#----------- fit g and θP to y_o () -> begin - #----------- fit g and θP to y_o # end2end inversion int_ϕθP = ComponentArrayInterpreter(CA.ComponentVector( @@ -82,67 +82,78 @@ f = gen_hybridcase_PBmodel(case; scenario) end #---------- HVI -# TODO think about good general initializations -coef_logσ2_logMs = 
[-5.769 -3.501; -0.01791 0.007951] -logσ2_logP = CA.ComponentVector(r0 = -8.997, K2 = -5.893) -mean_σ_o_MC = 0.006042 - -# correlation matrices -ρsP = zeros(sum(1:(n_θP - 1))) -ρsM = zeros(sum(1:(n_θM - 1))) - -ϕunc = CA.ComponentVector(; - logσ2_logP = logσ2_logP, - coef_logσ2_logMs = coef_logσ2_logMs, - ρsP, - ρsM) -int_unc = ComponentArrayInterpreter(ϕunc) - -# for a conservative uncertainty assume σ2=1e-10 and no relationship with magnitude -ϕunc0 = CA.ComponentVector(; - logσ2_logP = fill(-10.0, n_θP), - coef_logσ2_logMs = reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), - ρsP, - ρsM) - logσ2y = 2 .* log.(σ_o) n_MC = 3 +(; ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs) = init_hybrid_params( + θP_true, θMs_true[:, 1], ϕg_opt1, n_batch; transP = asℝ₊, transM = asℝ₊); +ϕ_true = ϕ -transPMs_batch = as( - (P = as(Array, asℝ₊, n_θP), - Ms = as(Array, asℝ₊, n_θM, n_batch))) -transPMs_all = as( - (P = as(Array, asℝ₊, n_θP), - Ms = as(Array, asℝ₊, n_θM, n_site))) - -n_ϕg = length(ϕg_opt1) -ϕt_true = θ = CA.ComponentVector(; - μP = θP_true, - ϕg = ϕg_opt1, - unc = ϕunc); -trans_gu = as( - (μP = as(Array, asℝ₊, n_θP), - ϕg = as(Array, n_ϕg), - unc = as(Array, length(ϕunc)))) -trans_g = as( - (μP = as(Array, asℝ₊, n_θP), - ϕg = as(Array, n_ϕg))) - -#const -int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP = θP_true, - θMs = CA.ComponentMatrix( - zeros(n_θM, n_batch), first(CA.getaxes(θMs_true)), CA.Axis(i = 1:n_batch)))) - -interpreters = interpreters_g = map(get_concrete, - (; - μP_ϕg_unc = ComponentArrayInterpreter(ϕt_true), - PMs = int_PMs_batch, - unc = ComponentArrayInterpreter(ϕunc) - )) - -ϕ_true = inverse_ca(trans_gu, ϕt_true) +() -> begin + coef_logσ2_logMs = [-5.769 -3.501; -0.01791 0.007951] + logσ2_logP = CA.ComponentVector(r0 = -8.997, K2 = -5.893) + mean_σ_o_MC = 0.006042 + + # correlation matrices + ρsP = zeros(sum(1:(n_θP - 1))) + ρsM = zeros(sum(1:(n_θM - 1))) + + ϕunc = CA.ComponentVector(; + logσ2_logP = logσ2_logP, + coef_logσ2_logMs = coef_logσ2_logMs, + ρsP, + ρsM) + int_unc = ComponentArrayInterpreter(ϕunc) + + # for a conservative uncertainty assume σ2=1e-10 and no relationship with magnitude + ϕunc0 = CA.ComponentVector(; + logσ2_logP = fill(-10.0, n_θP), + coef_logσ2_logMs = reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), + ρsP, + ρsM) + + transPMs_batch = as( + (P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_batch))) + transPMs_allsites = as( + (P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_site))) + + n_ϕg = length(ϕg_opt1) + ϕt_true = θ = CA.ComponentVector(; + μP = θP_true, + ϕg = ϕg_opt1, + unc = ϕunc) + trans_gu = as( + (μP = as(Array, asℝ₊, n_θP), + ϕg = as(Array, n_ϕg), + unc = as(Array, length(ϕunc)))) + trans_g = as( + (μP = as(Array, asℝ₊, n_θP), + ϕg = as(Array, n_ϕg))) + + #const + int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP = θP_true, + θMs = CA.ComponentMatrix( + zeros(n_θM, n_batch), first(CA.getaxes(θMs_true)), CA.Axis(i = 1:n_batch)))) + + interpreters = interpreters_g = map(get_concrete, + (; + μP_ϕg_unc = ComponentArrayInterpreter(ϕt_true), + PMs = int_PMs_batch, + unc = ComponentArrayInterpreter(ϕunc) + )) + + ϕ_true = inverse_ca(trans_gu, ϕt_true) +end + +ϕ_ini0 = ζ = vcat(ϕ_true[:μP] .* 0.0, ϕg0, ϕ_true[[:unc]]); # scratch +# +# true values ϕ_ini = ζ = vcat(ϕ_true[[:μP, :ϕg]] .* 1.2, ϕ_true[[:unc]]); # slight disturbance -ϕ_ini0 = ζ = vcat(ϕ_true[:μP] .* 0.0, ϕg0, ϕunc0); # scratch +# hardcoded from HMC inversion +ϕ_ini.unc.coef_logσ2_logMs = [-5.769 -3.501; -0.01791 0.007951] 
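+# illustration: column j of coef_logσ2_logMs holds (intercept, slope) of an
+# affine model of the log-variance of the j-th ML-predicted parameter,
+#   logσ2_j(ζ) = coef_logσ2_logMs[1, j] + coef_logσ2_logMs[2, j] * ζ
+# so the implied standard deviation is σ_j(ζ) = exp(logσ2_j(ζ) / 2);
+# e.g. the first column at ζ = -1.0 gives exp((-5.769 + 0.01791) / 2) ≈ 0.056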
+ϕ_ini.unc.logσ2_logP = CA.ComponentVector(r0 = -8.997, K2 = -5.893) +mean_σ_o_MC = 0.006042 # test cost function and gradient () -> begin @@ -161,10 +172,10 @@ end train_loader = MLUtils.DataLoader((xM, y_o), batchsize = n_batch) optf = Optimization.OptimizationFunction( - (ζg, data) -> begin + (ϕ, data) -> begin xM, y_o = data neg_elbo_transnorm_gf( - rng, g, f, ζg, y_o, xM, transPMs_batch, + rng, g, f, ϕ, y_o, xM, transPMs_batch, map(get_concrete, interpreters_g); n_MC = 5, logσ2y) end, Optimization.AutoZygote()) @@ -181,7 +192,7 @@ g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario # otpimize using LUX () -> begin - using Lux + #using Lux g_lux = Lux.Chain( # dense layer with bias that maps to 8 outputs and applies `tanh` activation Lux.Dense(n_covar => n_covar * 4, tanh), @@ -208,18 +219,19 @@ function fcost(ϕ) n_MC = 8, logσ2y = logσ2y) end fcost(ϕ) -Zygote.gradient(fcost, ϕ) |> cpu; +#Zygote.gradient(fcost, ϕ) |> cpu; gr = Zygote.gradient(fcost, CA.getdata(ϕ)); -gr_c = CA.ComponentArray(gr[1], CA.getaxes(ϕ)...) +gr_c = CA.ComponentArray(gr[1] |> Flux.cpu, CA.getaxes(ϕ)...) train_loader = MLUtils.DataLoader((xM_gpu, y_o), batchsize = n_batch) optf = Optimization.OptimizationFunction( - (ζg, data) -> begin + (ϕ, data) -> begin xM, y_o = data - neg_elbo_transnorm_gf( - rng, g_flux, f, ζg, y_o, xM, transPMs_batch, - map(get_concrete, interpreters_g); n_MC = 5, logσ2y) + fcost(ϕ) + # neg_elbo_transnorm_gf( + # rng, g_flux, f, ϕ, y_o, xM, transPMs_batch, + # map(get_concrete, interpreters); n_MC = 5, logσ2y) end, Optimization.AutoZygote()) optprob = Optimization.OptimizationProblem( @@ -230,40 +242,31 @@ res = res_gpu = Optimization.solve( # start from zero () -> begin optprob = Optimization.OptimizationProblem( - optf, CA.getdata(ϕ_ini0) |> Flux.gpu, train_loader); + optf, CA.getdata(ϕ_ini0) |> Flux.gpu, train_loader) res = res_gpu = Optimization.solve( - optprob, Optimisers.Adam(0.02), callback = callback_loss(50), maxiters = 4_000); + optprob, Optimisers.Adam(0.02), callback = callback_loss(50), maxiters = 4_000) end -ζ_VIc = interpreters_g.μP_ϕg_unc(res.u |> Flux.cpu) -ζMs_VI = g(xM, ζ_VIc.ϕg) -ϕunc_VI = int_unc(ζ_VIc.unc) +ζ_VIc = interpreters.μP_ϕg_unc(res.u |> Flux.cpu) +ζMs_VI = g_flux(xM_gpu, ζ_VIc.ϕg |> Flux.gpu) |> Flux.cpu +ϕunc_VI = interpreters.unc(ζ_VIc.unc) hcat(θP_true, exp.(ζ_VIc.μP)) plt = scatterplot(vec(θMs_true), vec(exp.(ζMs_VI))) #lineplot!(plt, 0.0, 1.1, identity) # -hcat(ϕunc, ϕunc_VI) # need to compare to MC sample +hcat(ϕ_ini.unc, ϕunc_VI) # need to compare to MC sample # hard to estimate for original very small theta's but otherwise good # test predicting correct obs-uncertainty of predictive posterior # TODO reuse g_flux rather than g n_sample_pred = 200 -intm_PMs_gen = ComponentArrayInterpreter(CA.ComponentVector(; θP = θP_true, - θMs = CA.ComponentMatrix( - zeros(n_θM, n_site), first(CA.getaxes(θMs_true)), CA.Axis(i = 1:n_sample_pred)))) - -ζs, _ = HVI.generate_ζ(rng, g, f, res.u |> Flux.cpu, xM, - (; interpreters..., PMs = intm_PMs_gen); n_MC = n_sample_pred) -# ζ = ζs[:,1] -θsc = stack( - ζ -> CA.getdata(CA.ComponentVector( - TransformVariables.transform(transPMs_all, ζ))), - eachcol(ζs)); -y_pred = stack(map(ζ -> first(HVI.predict_y(ζ, f, transPMs_all)), eachcol(ζs))); - -size(y_pred) -σ_o_post = mapslices(std, y_pred; dims = 3)[:, :, 1]; +y_pred = predict_gf(rng, g_flux, f, res.u, xM_gpu, interpreters; + get_transPMs, get_ca_int_PMs, n_sample_pred); +size(y_pred) # n_obs x n_site, n_sample_pred + +σ_o_post = 
dropdims(std(y_pred; dims = 3), dims=3) + #describe(σ_o_post) hcat(σ_o, fill(mean_σ_o_MC, length(σ_o)), mean(σ_o_post, dims = 2), sqrt.(mean(abs2, σ_o_post, dims = 2))) diff --git a/src/HybridVariationalInference.jl b/src/HybridVariationalInference.jl index 1c619e4..5f23d33 100644 --- a/src/HybridVariationalInference.jl +++ b/src/HybridVariationalInference.jl @@ -44,9 +44,12 @@ include("logden_normal.jl") #export - all internal include("cholesky.jl") -export neg_elbo_transnorm_gf +export neg_elbo_transnorm_gf, predict_gf include("elbo.jl") +export init_hybrid_params +include("init_hybrid_params.jl") + export DoubleMM include("DoubleMM/DoubleMM.jl") diff --git a/src/elbo.jl b/src/elbo.jl index eb6d4db..9523994 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -21,34 +21,37 @@ expected value of the likelihood of observations. function neg_elbo_transnorm_gf(rng, g, f, ϕ::AbstractVector, y_ob, x::AbstractMatrix, transPMs, interpreters::NamedTuple; n_MC=3, logσ2y, gpu_data_handler = get_default_GPUHandler()) - ζ, logdetΣ = generate_ζ(rng, g, f, ϕ, x, interpreters; n_MC) - ζ_cpu = gpu_data_handler(ζ) # differentiable fetch to CPU in Flux package extension - #ζi = first(eachcol(ζ_cpu)) - nLy = reduce(+, map(eachcol(ζ_cpu)) do ζi + ζs, logdetΣ = generate_ζ(rng, g, f, ϕ, x, interpreters; n_MC) + ζs_cpu = gpu_data_handler(ζs) # differentiable fetch to CPU in Flux package extension + #ζi = first(eachcol(ζs_cpu)) + nLy = reduce(+, map(eachcol(ζs_cpu)) do ζi y_pred_i, logjac = predict_y(ζi, f, transPMs) nLy1 = neg_logden_indep_normal(y_ob, y_pred_i, logσ2y) nLy1 - logjac end) / n_MC - ent = entropy_MvNormal(size(ζ, 1), logdetΣ) # defined in logden_normal - nLy - ent -end - -function predict_gf(rng, g, f, ϕ::AbstractVector, x::AbstractMatrix, - transPMs, interpreters::NamedTuple; - n_MC=3, logσ2y, gpu_data_handler = get_default_GPUHandler()) - ζ, logdetΣ = generate_ζ(rng, g, f, ϕ, x, interpreters; n_MC) - ζ_cpu = gpu_data_handler(ζ) # differentiable fetch to CPU in Flux package extension - #ζi = first(eachcol(ζ_cpu)) - nLy = reduce(+, map(eachcol(ζ_cpu)) do ζi - y_pred_i, logjac = predict_y(ζi, f, transPMs) - nLy1 = neg_logden_indep_normal(y_ob, y_pred_i, logσ2y) - nLy1 - logjac - end) / n_MC - ent = entropy_MvNormal(size(ζ, 1), logdetΣ) # defined in logden_normal + ent = entropy_MvNormal(size(ζs, 1), logdetΣ) # defined in logden_normal nLy - ent end +""" + predict_gf(rng, g, f, ϕ::AbstractVector, xM::AbstractMatrix, interpreters; + get_transPMs, get_ca_int_PMs, n_sample_pred=200, + gpu_data_handler=get_default_GPUHandler()) +Prediction function for hybrid model. Retuns an Array `(n_obs, n_site, n_sample_pred)`. +""" +function predict_gf(rng, g, f, ϕ::AbstractVector, xM::AbstractMatrix, interpreters; + get_transPMs, get_ca_int_PMs, n_sample_pred=200, + gpu_data_handler=get_default_GPUHandler()) + n_site = size(xM, 2) + intm_PMs_gen = get_ca_int_PMs(n_site) + tans_PMs_gen = get_transPMs(n_site) + ζs, _ = generate_ζ(rng, g, f, CA.getdata(ϕ), CA.getdata(xM), + (; interpreters..., PMs = intm_PMs_gen); n_MC = n_sample_pred) + ζs_cpu = gpu_data_handler(ζs) # + y_pred = stack(map(ζ -> first(predict_y(ζ, f, tans_PMs_gen)), eachcol(ζs_cpu))); + y_pred +end """ Generate samples of (inv-transformed) model parameters, ζ, and Log-Determinant @@ -144,7 +147,7 @@ function _create_random(rng, ::GPUArraysCore.AbstractGPUVector{T}, dims...) wher # ignores rng # https://discourse.julialang.org/t/help-using-cuda-zygote-and-random-numbers/123458/4?u=bgctw # Zygote.@ignore CUDA.randn(rng, dims...) 
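+    # the draw must stay opaque to AD: gradients flow through Uσ' * u in
+    # sample_ζ_norm0 (reparameterization trick), not through u itself.
+    # minimal sketch of the pattern (hypothetical sizes n_θ, n_MC):
+    #   u = ChainRulesCore.@ignore_derivatives CUDA.randn(n_θ, n_MC)
+    #   ζ_resid = Uσ' * u  # differentiable w.r.t. parameters entering Uσ
+    # @ignore_derivatives works with any ChainRules-aware AD backend,
+    # whereas Zygote.@ignore is specific to Zygote.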
- Zygote.@ignore CUDA.randn(dims...) + ChainRulesCore.@ignore_derivatives CUDA.randn(dims...) end """ diff --git a/src/init_hybrid_params.jl b/src/init_hybrid_params.jl new file mode 100644 index 0000000..955d8ba --- /dev/null +++ b/src/init_hybrid_params.jl @@ -0,0 +1,69 @@ +""" + init_hybrid_params(θP, θM, ϕg, n_batch; transP=asℝ, transM=asℝ) + +Setup ComponentVector of parameters to optimize, and associated tools. +Returns a NamedTuple of +- ϕ: A ComponentVector of parameters to optimize +- transPMs_batch, interpreters: Transformations and interpreters as + required by `neg_elbo_transnorm_gf`. +- get_transPMs: a function returning transformations `(n_site) -> (;P,Ms)` +- get_ca_int_PMs: a function returning ComponentArrayInterpreter for PMs vector + with PMs shaped as a matrix of `n_site` columns of `θM` + +# Arguments +- `θP`, `θM`: Template ComponentVectors of global parameters and ML-predicted parameters +- `ϕg`: vector of parameters to optimize, as returned by `gen_hybridcase_MLapplicator` +- `n_batch`: the number of sites to predicted in each mini-batch +- `transP`, `transM`: the Transformations for the global and site-dependent parameters +""" +function init_hybrid_params(θP, θM, ϕg, n_batch; transP=asℝ, transM=asℝ) + n_θP = length(θP) + n_θM = length(θM) + n_ϕg = length(ϕg) + # zero correlation matrices + ρsP = zeros(sum(1:(n_θP - 1))) + ρsM = zeros(sum(1:(n_θM - 1))) + ϕunc0 = CA.ComponentVector(; + logσ2_logP = fill(-10.0, n_θP), + coef_logσ2_logMs = reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), + ρsP, + ρsM) + ϕt = CA.ComponentVector(; + μP = θP, + ϕg = ϕg, + unc = ϕunc0); + # + get_transPMs = let transP=transP, transM=transM, n_θP=n_θP, n_θM=n_θM + function get_transPMs_inner(n_site) + transPMs = as( + (P = as(Array, transP, n_θP), + Ms = as(Array, transM, n_θM, n_site))) + end + end + transPMs_batch = get_transPMs(n_batch) + trans_gu = as( + (μP = as(Array, asℝ₊, n_θP), + ϕg = as(Array, n_ϕg), + unc = as(Array, length(ϕunc0)))) + ϕ = inverse_ca(trans_gu, ϕt) + # trans_g = as( + # (μP = as(Array, asℝ₊, n_θP), + # ϕg = as(Array, n_ϕg))) + # + get_ca_int_PMs = let + function get_ca_int_PMs_inner(n_site) + ComponentArrayInterpreter(CA.ComponentVector(; θP, + θMs = CA.ComponentMatrix( + zeros(n_θM, n_site), first(CA.getaxes(θM)), CA.Axis(i = 1:n_site)))) + end + + end + interpreters = map(get_concrete, + (; + μP_ϕg_unc = ComponentArrayInterpreter(ϕt), + PMs = get_ca_int_PMs(n_batch), + unc = ComponentArrayInterpreter(ϕunc0) + )) + (;ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs) +end + diff --git a/test/test_cholesky_structure.jl b/test/test_cholesky_structure.jl index ad188f4..6566716 100644 --- a/test/test_cholesky_structure.jl +++ b/test/test_cholesky_structure.jl @@ -247,8 +247,8 @@ end #@test Upred ≈ CU SUpred = Upred * Dσ #hcat(SUpred, SU) - @test SUpred≈SU atol=1e-1 + @test SUpred≈SU atol=2e-1 S_pred = Dσ' * Upred' * Upred * Dσ - @test S_pred≈S atol=1e-1 + @test S_pred≈S atol=2e-1 end diff --git a/test/test_elbo.jl b/test/test_elbo.jl index 735a960..2fe1676 100644 --- a/test/test_elbo.jl +++ b/test/test_elbo.jl @@ -28,77 +28,85 @@ f = gen_hybridcase_PBmodel(case; scenario) (; xM, θP_true, θMs_true, xP, y_global_true, y_true, y_global_o, y_o, σ_o ) = gen_hybridcase_synthetic(case, rng; scenario); -# correlation matrices -ρsP = zeros(sum(1:(n_θP - 1))) -ρsM = zeros(sum(1:(n_θM - 1))) +logσ2y = 2 .* log.(σ_o) +n_MC = 3 +(; ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs) = init_hybrid_params( + θP_true, θMs_true[:, 1], ϕg0, n_batch; transP = 
asℝ₊, transM = asℝ₊); +ϕ_ini = ϕ () -> begin - coef_logσ2_logMs = [-5.769 -3.501; -0.01791 0.007951] - logσ2_logP = CA.ComponentVector(r0 = -8.997, K2 = -5.893) - #mean_σ_o_MC = 0.006042 - - ϕunc = CA.ComponentVector(; - logσ2_logP = logσ2_logP, - coef_logσ2_logMs = coef_logσ2_logMs, + # correlation matrices + ρsP = zeros(sum(1:(n_θP - 1))) + ρsM = zeros(sum(1:(n_θM - 1))) + + () -> begin + coef_logσ2_logMs = [-5.769 -3.501; -0.01791 0.007951] + logσ2_logP = CA.ComponentVector(r0 = -8.997, K2 = -5.893) + #mean_σ_o_MC = 0.006042 + + ϕunc = CA.ComponentVector(; + logσ2_logP = logσ2_logP, + coef_logσ2_logMs = coef_logσ2_logMs, + ρsP, + ρsM) + end + + # for a conservative uncertainty assume σ2=1e-10 and no relationship with magnitude + logσ2y = 2 .* log.(σ_o) + ϕunc0 = CA.ComponentVector(; + logσ2_logP = fill(-10.0, n_θP), + coef_logσ2_logMs = reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), ρsP, ρsM) + #int_unc = ComponentArrayInterpreter(ϕunc0) + + transPMs_batch = as( + (P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_batch))) + transPMs_allsites = as( + (P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_site))) + + ϕ_true = θ = CA.ComponentVector(; + μP = θP_true, + ϕg = ϕg0, #ϕg_opt, # here start from randomized + unc = ϕunc0) + trans_gu = as( + (μP = as(Array, asℝ₊, n_θP), + ϕg = as(Array, length(ϕg0)), + unc = as(Array, length(ϕunc0)))) + trans_g = as( + (μP = as(Array, asℝ₊, n_θP), + ϕg = as(Array, length(ϕg0)))) + + int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP = θP_true, + θMs = CA.ComponentMatrix( + zeros(n_θM, n_batch), first(CA.getaxes(θMs_true)), CA.Axis(i = 1:n_batch)))) + + interpreters = map(get_concrete, + (; + μP_ϕg_unc = ComponentArrayInterpreter(ϕ_true), + PMs = int_PMs_batch, + unc = ComponentArrayInterpreter(ϕunc0) + )) + + ϕg_true_vec = CA.ComponentVector( + TransformVariables.inverse(trans_gu, CP.cv2NamedTuple(ϕ_true))) + ϕcg_true = interpreters.μP_ϕg_unc(ϕg_true_vec) + ϕ_ini = ζ = vcat(ϕcg_true[[:μP, :ϕg]] .* 1.2, ϕcg_true[[:unc]]) + ϕ_ini0 = ζ = vcat(ϕcg_true[:μP] .* 0.0, ϕg0, ϕunc0) end -# for a conservative uncertainty assume σ2=1e-10 and no relationship with magnitude -logσ2y = 2 .* log.(σ_o) -ϕunc0 = CA.ComponentVector(; - logσ2_logP = fill(-10.0, n_θP), - coef_logσ2_logMs = reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), - ρsP, - ρsM) -#int_unc = ComponentArrayInterpreter(ϕunc0) - -transPMs_batch = as( - (P = as(Array, asℝ₊, n_θP), - Ms = as(Array, asℝ₊, n_θM, n_batch))) -transPMs_all = as( - (P = as(Array, asℝ₊, n_θP), - Ms = as(Array, asℝ₊, n_θM, n_site))) - -ϕ_true = θ = CA.ComponentVector(; - μP = θP_true, - ϕg = ϕg0, #ϕg_opt, # here start from randomized - unc = ϕunc0); -trans_gu = as( - (μP = as(Array, asℝ₊, n_θP), - ϕg = as(Array, length(ϕg0)), - unc = as(Array, length(ϕunc0)))) -trans_g = as( - (μP = as(Array, asℝ₊, n_θP), - ϕg = as(Array, length(ϕg0)))) - -int_PMs_batch = ComponentArrayInterpreter(CA.ComponentVector(; θP = θP_true, - θMs = CA.ComponentMatrix( - zeros(n_θM, n_batch), first(CA.getaxes(θMs_true)), CA.Axis(i = 1:n_batch)))) - -interpreters = map(get_concrete, - (; - μP_ϕg_unc = ComponentArrayInterpreter(ϕ_true), - PMs = int_PMs_batch, - unc = ComponentArrayInterpreter(ϕunc0) - )) - -ϕg_true_vec = CA.ComponentVector( - TransformVariables.inverse(trans_gu, CP.cv2NamedTuple(ϕ_true))) -ϕcg_true = interpreters.μP_ϕg_unc(ϕg_true_vec) -ϕ_ini = ζ = vcat(ϕcg_true[[:μP, :ϕg]] .* 1.2, ϕcg_true[[:unc]]); -ϕ_ini0 = ζ = vcat(ϕcg_true[:μP] .* 0.0, ϕg0, ϕunc0); - @testset "generate_ζ" begin ζ, logdetΣ = CP.generate_ζ( - 
rng, g, f, ϕcg_true, xM[:, 1:n_batch], map(get_concrete, interpreters); + rng, g, f, ϕ_ini, xM[:, 1:n_batch], map(get_concrete, interpreters); n_MC = 8) @test ζ isa Matrix gr = Zygote.gradient( ϕ -> sum(CP.generate_ζ( rng, g, f, ϕ, xM[:, 1:n_batch], map(get_concrete, interpreters); n_MC = 8)[1]), - CA.getdata(ϕcg_true)) + CA.getdata(ϕ_ini)) @test gr[1] isa Vector end; @@ -108,7 +116,7 @@ FluxMLengine = Val(nameof(Flux)) g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario) @testset "generate_ζ gpu" begin - ϕ = CuArray(CA.getdata(ϕcg_true)) + ϕ = CuArray(CA.getdata(ϕ_ini)) xMg_batch = CuArray(xM[:, 1:n_batch]) ζ, logdetΣ = CP.generate_ζ( rng, g_flux, f, ϕ, xMg_batch, map(get_concrete, interpreters); @@ -123,7 +131,7 @@ g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario end; @testset "neg_elbo_transnorm_gf cpu" begin - cost = neg_elbo_transnorm_gf(rng, g, f, ϕcg_true, y_o[:, 1:n_batch], xM[:, 1:n_batch], + cost = neg_elbo_transnorm_gf(rng, g, f, ϕ_ini, y_o[:, 1:n_batch], xM[:, 1:n_batch], transPMs_batch, map(get_concrete, interpreters); n_MC = 8, logσ2y) @test cost isa Float64 @@ -131,12 +139,12 @@ end; ϕ -> neg_elbo_transnorm_gf( rng, g, f, ϕ, y_o[:, 1:n_batch], xM[:, 1:n_batch], transPMs_batch, interpreters; n_MC = 8, logσ2y), - CA.getdata(ϕcg_true)) + CA.getdata(ϕ_ini)) @test gr[1] isa Vector end; @testset "neg_elbo_transnorm_gf gpu" begin - ϕ = CuArray(CA.getdata(ϕcg_true)) + ϕ = CuArray(CA.getdata(ϕ_ini)) xMg_batch = CuArray(xM[:, 1:n_batch]) cost = neg_elbo_transnorm_gf(rng, g_flux, f, ϕ, y_o[:, 1:n_batch], xMg_batch, transPMs_batch, map(get_concrete, interpreters); @@ -150,3 +158,21 @@ end; @test gr[1] isa CuVector end; +@testset "predict_gf cpu" begin + n_sample_pred = 200 + y_pred = predict_gf(rng, g, f, ϕ_ini, xM, map(get_concrete, interpreters); + get_transPMs, get_ca_int_PMs, n_sample_pred) + @test y_pred isa Array + @test size(y_pred) == (size(y_o)..., n_sample_pred) +end + +@testset "predict_gf gpu" begin + n_sample_pred = 200 + ϕ = CuArray(CA.getdata(ϕ_ini)) + xMg = CuArray(xM) + y_pred = predict_gf(rng, g_flux, f, ϕ, xMg, map(get_concrete, interpreters); + get_transPMs, get_ca_int_PMs, n_sample_pred) + @test y_pred isa Array + @test size(y_pred) == (size(y_o)..., n_sample_pred) +end + From 7b892c6b1154ec15010cb1d086f55d4a05dea2e4 Mon Sep 17 00:00:00 2001 From: Thomas Wutzler Date: Fri, 10 Jan 2025 10:42:37 +0100 Subject: [PATCH 4/6] typos --- src/elbo.jl | 8 ++++---- test/test_logden_normal.jl | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/elbo.jl b/src/elbo.jl index 9523994..15379bd 100644 --- a/src/elbo.jl +++ b/src/elbo.jl @@ -5,7 +5,7 @@ It generates n_MC samples for each site, and uses these to compute the expected value of the likelihood of observations. ## Arguments -- rng: random number generator (ignored on CUDA, if ϕ is a AbstactGPUArray) +- rng: random number generator (ignored on CUDA, if ϕ is a AbstractGPUArray) - g: machine learnig model - f: mechanistic model - ϕ: flat vector of parameters @@ -13,7 +13,7 @@ expected value of the likelihood of observations. interpreted by interpreters.μP_ϕg_unc and interpreters.PMs - y_ob: matrix of observations (n_obs x n_site_batch) - x: matrix of covariates (n_cov x n_site_batch) -- transPMs: Transformations with componets P, Ms, similar to interpreters +- transPMs: Transformations with components P, Ms, similar to interpreters - n_MC: number of MonteCarlo samples from the distribution of parameters to simulate using the mechanistic model f. 
 - logσ2y: observation uncertainty (log of the variance)
@@ -38,7 +38,7 @@ end
     get_transPMs, get_ca_int_PMs, n_sample_pred=200,
     gpu_data_handler=get_default_GPUHandler())
 
-Prediction function for hybrid model. Retuns an Array `(n_obs, n_site, n_sample_pred)`.
+Prediction function for hybrid model. Returns an Array `(n_obs, n_site, n_sample_pred)`.
 """
 function predict_gf(rng, g, f, ϕ::AbstractVector, xM::AbstractMatrix, interpreters;
     get_transPMs, get_ca_int_PMs, n_sample_pred=200,
     gpu_data_handler=get_default_GPUHandler())
@@ -81,7 +81,7 @@ end
 """
-Extract relevant parameters from θ and return n_MC generted draws
+Extract relevant parameters from θ and return n_MC generated draws
 together with the logdet of the transformation.
 
-Necessary typestable information on number of compponents are provided with
+Necessary typestable information on number of components is provided with
diff --git a/test/test_logden_normal.jl b/test/test_logden_normal.jl
index 91147c4..d1be1dc 100644
--- a/test/test_logden_normal.jl
+++ b/test/test_logden_normal.jl
@@ -12,7 +12,7 @@ using LinearAlgebra
     logden_norm_l(y, μ, logσ) = -1 / 2 .* (2 .* logσ .+ abs2.(y .- μ) ./ abs2.(exp.(logσ)))
     neg_logden_norm_l2(y, μ, logσ2) = (logσ2 .+ abs2.(y .- μ) .* exp.(-logσ2)) ./ 2
 
-    # first test that neg_logden_norm_l2 retuns values of logpdf(Normal) up to an additive
+    # first test that neg_logden_norm_l2 returns values of logpdf(Normal) up to an additive constant
     μ = [1.0, 1.0]
From 9e5e21c59ea69b2e8de9bc8352712e6f0e3b9198 Mon Sep 17 00:00:00 2001
From: Thomas Wutzler
Date: Fri, 10 Jan 2025 10:43:39 +0100
Subject: [PATCH 5/6] typos

---
 src/elbo.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/elbo.jl b/src/elbo.jl
index 15379bd..d7ed51d 100644
--- a/src/elbo.jl
+++ b/src/elbo.jl
@@ -6,7 +6,7 @@
 expected value of the likelihood of observations. 
## Arguments - rng: random number generator (ignored on CUDA, if ϕ is a AbstractGPUArray) -- g: machine learnig model +- g: machine learning model - f: mechanistic model - ϕ: flat vector of parameters including parameter of f (ϕ_P), of g (ϕ_Ms), and of VI (ϕ_unc), From c10b69e179098a104de1ee03eb3fcd296810bce9 Mon Sep 17 00:00:00 2001 From: Thomas Wutzler Date: Mon, 13 Jan 2025 08:14:19 +0100 Subject: [PATCH 6/6] check for functional CUDA before GPU tests --- test/test_Flux.jl | 10 +++-- test/test_elbo.jl | 77 ++++++++++++++++++----------------- test/test_sample_zeta.jl | 87 ++++++++++++++++++++++------------------ 3 files changed, 94 insertions(+), 80 deletions(-) diff --git a/test/test_Flux.jl b/test/test_Flux.jl index a3ba11b..ad49eb8 100644 --- a/test/test_Flux.jl +++ b/test/test_Flux.jl @@ -19,9 +19,11 @@ using Flux #typeof(HybridVariationalInference.default_GPU_DataHandler) h = get_default_GPUHandler() @test !(h isa NullGPUDataHandler) - x = CuArray(1:5) - xh = h(x) - @test xh isa Vector + if CUDA.functional() + x = CuArray(1:5) + xh = h(x) + @test xh isa Vector + end end; @@ -32,7 +34,7 @@ end; Dense(n_covar => n_covar * 4, tanh), Dense(n_covar * 4 => n_covar * 4, tanh), Dense(n_covar * 4 => n_out, identity, bias=false), - ); + ) g = construct_FluxApplicator(g_chain) n_site = 3 x = rand(Float32, n_covar, n_site) diff --git a/test/test_elbo.jl b/test/test_elbo.jl index 2fe1676..bcc3f7a 100644 --- a/test/test_elbo.jl +++ b/test/test_elbo.jl @@ -115,20 +115,22 @@ using Flux FluxMLengine = Val(nameof(Flux)) g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario) -@testset "generate_ζ gpu" begin - ϕ = CuArray(CA.getdata(ϕ_ini)) - xMg_batch = CuArray(xM[:, 1:n_batch]) - ζ, logdetΣ = CP.generate_ζ( - rng, g_flux, f, ϕ, xMg_batch, map(get_concrete, interpreters); - n_MC = 8) - @test ζ isa CuMatrix - gr = Zygote.gradient( - ϕ -> sum(CP.generate_ζ( +if CUDA.functional() + @testset "generate_ζ gpu" begin + ϕ = CuArray(CA.getdata(ϕ_ini)) + xMg_batch = CuArray(xM[:, 1:n_batch]) + ζ, logdetΣ = CP.generate_ζ( rng, g_flux, f, ϕ, xMg_batch, map(get_concrete, interpreters); - n_MC = 8)[1]), - ϕ) - @test gr[1] isa CuVector -end; + n_MC = 8) + @test ζ isa CuMatrix + gr = Zygote.gradient( + ϕ -> sum(CP.generate_ζ( + rng, g_flux, f, ϕ, xMg_batch, map(get_concrete, interpreters); + n_MC = 8)[1]), + ϕ) + @test gr[1] isa CuVector + end +end @testset "neg_elbo_transnorm_gf cpu" begin cost = neg_elbo_transnorm_gf(rng, g, f, ϕ_ini, y_o[:, 1:n_batch], xM[:, 1:n_batch], @@ -143,20 +145,22 @@ end; @test gr[1] isa Vector end; -@testset "neg_elbo_transnorm_gf gpu" begin - ϕ = CuArray(CA.getdata(ϕ_ini)) - xMg_batch = CuArray(xM[:, 1:n_batch]) - cost = neg_elbo_transnorm_gf(rng, g_flux, f, ϕ, y_o[:, 1:n_batch], xMg_batch, - transPMs_batch, map(get_concrete, interpreters); - n_MC = 8, logσ2y) - @test cost isa Float64 - gr = Zygote.gradient( - ϕ -> neg_elbo_transnorm_gf( - rng, g_flux, f, ϕ, y_o[:, 1:n_batch], xMg_batch, - transPMs_batch, interpreters; n_MC = 8, logσ2y), - ϕ) - @test gr[1] isa CuVector -end; +if CUDA.functional() + @testset "neg_elbo_transnorm_gf gpu" begin + ϕ = CuArray(CA.getdata(ϕ_ini)) + xMg_batch = CuArray(xM[:, 1:n_batch]) + cost = neg_elbo_transnorm_gf(rng, g_flux, f, ϕ, y_o[:, 1:n_batch], xMg_batch, + transPMs_batch, map(get_concrete, interpreters); + n_MC = 8, logσ2y) + @test cost isa Float64 + gr = Zygote.gradient( + ϕ -> neg_elbo_transnorm_gf( + rng, g_flux, f, ϕ, y_o[:, 1:n_batch], xMg_batch, + transPMs_batch, interpreters; n_MC = 8, logσ2y), + ϕ) + @test gr[1] isa 
CuVector + end +end @testset "predict_gf cpu" begin n_sample_pred = 200 @@ -166,13 +170,14 @@ end; @test size(y_pred) == (size(y_o)..., n_sample_pred) end -@testset "predict_gf gpu" begin - n_sample_pred = 200 - ϕ = CuArray(CA.getdata(ϕ_ini)) - xMg = CuArray(xM) - y_pred = predict_gf(rng, g_flux, f, ϕ, xMg, map(get_concrete, interpreters); - get_transPMs, get_ca_int_PMs, n_sample_pred) - @test y_pred isa Array - @test size(y_pred) == (size(y_o)..., n_sample_pred) +if CUDA.functional() + @testset "predict_gf gpu" begin + n_sample_pred = 200 + ϕ = CuArray(CA.getdata(ϕ_ini)) + xMg = CuArray(xM) + y_pred = predict_gf(rng, g_flux, f, ϕ, xMg, map(get_concrete, interpreters); + get_transPMs, get_ca_int_PMs, n_sample_pred) + @test y_pred isa Array + @test size(y_pred) == (size(y_o)..., n_sample_pred) + end end - diff --git a/test/test_sample_zeta.jl b/test/test_sample_zeta.jl index 6447fcd..b27d0b0 100644 --- a/test/test_sample_zeta.jl +++ b/test/test_sample_zeta.jl @@ -23,7 +23,7 @@ scenario = (:default,) @testset "test_sample_zeta" begin (; xM, θP_true, θMs_true, xP, y_global_true, y_true, y_global_o, y_o - ) = gen_hybridcase_synthetic(case, rng; scenario); +) = gen_hybridcase_synthetic(case, rng; scenario) # n_site = 2 # n_θP, n_θM = length(θ_true.θP), length(θ_true.θM) @@ -31,59 +31,66 @@ scenario = (:default,) # θMs_true = θ_true.θM .+ randn(n_θM, n_site) .* σ_θM # set to 0.02 rather than zero for debugging non-zero correlations - ρsP = zeros(sum(1:(n_θP-1))) .+ 0.02 - ρsM = zeros(sum(1:(n_θM-1))) .+ 0.02 + ρsP = zeros(sum(1:(n_θP - 1))) .+ 0.02 + ρsM = zeros(sum(1:(n_θM - 1))) .+ 0.02 ϕunc = CA.ComponentVector(; - logσ2_logP=fill(-10.0, n_θP), - coef_logσ2_logMs=reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), - ρsP, - ρsM) + logσ2_logP = fill(-10.0, n_θP), + coef_logσ2_logMs = reduce(hcat, ([-10.0, 0.0] for _ in 1:n_θM)), + ρsP, + ρsM) transPMs = as( - (P=as(Array, asℝ₊, n_θP), - Ms=as(Array, asℝ₊, n_θM, n_site))) + (P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_site))) θ_true = θ = CA.ComponentVector(; - P=θP_true, - Ms=θMs_true) + P = θP_true, + Ms = θMs_true) transPMs = as(( - P=as(Array, asℝ₊, n_θP), - Ms=as(Array, asℝ₊, n_θM, n_site))) + P = as(Array, asℝ₊, n_θP), + Ms = as(Array, asℝ₊, n_θM, n_site))) ζ_true = inverse_ca(transPMs, θ_true) - ϕ_true = vcat(ζ_true, CA.ComponentVector(unc=ϕunc)) - ϕ_cpu = vcat(ζ_true .+ 0.01, CA.ComponentVector(unc=ϕunc)) + ϕ_true = vcat(ζ_true, CA.ComponentVector(unc = ϕunc)) + ϕ_cpu = vcat(ζ_true .+ 0.01, CA.ComponentVector(unc = ϕunc)) - interpreters = (; pmu=ComponentArrayInterpreter(ϕ_true)) #, M=int_θM, PMs=int_θPMs) + interpreters = (; pmu = ComponentArrayInterpreter(ϕ_true)) #, M=int_θM, PMs=int_θPMs) n_MC = 3 @testset "sample_ζ_norm0 cpu" begin ϕ = CA.getdata(ϕ_cpu) ϕc = interpreters.pmu(ϕ) ζ_resid, logdetΣ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC) - @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC) - gr = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc)[1]), ϕc)[1] - @test length(gr) == length(ϕ) - end; + @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC) + gr = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc)[1]), ϕc)[1] + @test length(gr) == length(ϕ) + end # - @testset "sample_ζ_norm0 gpu" begin - ϕ = CuArray(CA.getdata(ϕ_cpu)); - #tmp = ϕ[1:6] - #vec2uutri(tmp) - ϕc = interpreters.pmu(ϕ); - @test CA.getdata(ϕc) isa GPUArraysCore.AbstractGPUArray - #ζP, ζMs, ϕunc = ϕc.P, ϕc.Ms, ϕc.unc - #urand = CUDA.randn(length(ϕc.P) + length(ϕc.Ms), n_MC) |> gpu - 
#include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss - #ζ_resid, logdetΣ = sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC) - #Zygote.gradient(ϕc -> sum(sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)[1]), ϕc)[1]; - ζ_resid, logdetΣ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC) - @test ζ_resid isa GPUArraysCore.AbstractGPUArray - @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC) - gr = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)[1]), ϕc)[1]; - @test length(gr) == length(ϕ) - int_unc = ComponentArrayInterpreter(ϕc.unc) - gr2 = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(rng, CA.getdata(ϕc.P), CA.getdata(ϕc.Ms), CA.getdata(ϕc.unc), int_unc; n_MC)[1]), ϕc)[1]; - end; + + if CUDA.functional() + @testset "sample_ζ_norm0 gpu" begin + ϕ = CuArray(CA.getdata(ϕ_cpu)) + #tmp = ϕ[1:6] + #vec2uutri(tmp) + ϕc = interpreters.pmu(ϕ) + @test CA.getdata(ϕc) isa GPUArraysCore.AbstractGPUArray + #ζP, ζMs, ϕunc = ϕc.P, ϕc.Ms, ϕc.unc + #urand = CUDA.randn(length(ϕc.P) + length(ϕc.Ms), n_MC) |> gpu + #include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss + #ζ_resid, logdetΣ = sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC) + #Zygote.gradient(ϕc -> sum(sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)[1]), ϕc)[1]; + ζ_resid, logdetΣ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC) + @test ζ_resid isa GPUArraysCore.AbstractGPUArray + @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC) + gr = Zygote.gradient( + ϕc -> sum(CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)[1]), ϕc)[1] + @test length(gr) == length(ϕ) + int_unc = ComponentArrayInterpreter(ϕc.unc) + gr2 = Zygote.gradient( + ϕc -> sum(CP.sample_ζ_norm0(rng, CA.getdata(ϕc.P), CA.getdata(ϕc.Ms), + CA.getdata(ϕc.unc), int_unc; n_MC)[1]), + ϕc)[1] + end + end # @testset "generate_ζ" begin # ϕ = CA.getdata(ϕ_cpu) @@ -94,6 +101,6 @@ scenario = (:default,) # int_μP_ϕg_unc=ComponentArrayInterpreter(ϕ_true) # interpreters = (; PMs = intm_PMs_gen, μP_ϕg_unc = int_μP_ϕg_unc ) # ζs, _ = CP.generate_ζ(rng, g, f, ϕ, xM, interpreters; n_MC=n_sample_pred) - + # end; end;
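As a cross-check of the identities these tests rely on, a minimal sketch in plain Julia (the 3x3 upper-triangular factor is an arbitrary stand-in for the blocks built from UP, UM, and the σ scaling) verifies that the shortcut `logdetΣ = 2 * sum(log.(diag(Uσ)))` used by `sample_ζ_norm0` equals `logdet(Uσ' * Uσ)`, that the closed-form Gaussian entropy matches `Distributions.entropy`, and that the reparameterized draws `Uσ' * u` have the intended covariance:

    using Distributions, LinearAlgebra, Statistics

    Uσ = UpperTriangular([1.0 0.2 0.1; 0.0 0.8 0.3; 0.0 0.0 0.5])
    Σ = Matrix(Uσ' * Uσ)               # covariance implied by ζ_resid = Uσ' * u
    logdetΣ = 2 * sum(log.(diag(Uσ)))  # the shortcut used in sample_ζ_norm0
    @assert logdetΣ ≈ logdet(Σ)

    # entropy of a k-dimensional Gaussian: H = (k * (1 + log(2π)) + logdetΣ) / 2
    k = size(Σ, 1)
    H = (k * (1 + log(2π)) + logdetΣ) / 2
    @assert H ≈ entropy(MvNormal(zeros(k), Σ))

    # Monte-Carlo check of the covariance of the reparameterized draws
    u = randn(k, 100_000)
    @assert isapprox(cov(Uσ' * u; dims = 2), Σ; rtol = 0.05)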
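Similarly, a minimal call sketch for `init_hybrid_params` introduced in PATCH 3/6; only the signature and the returned fields follow the docstring in src/init_hybrid_params.jl, while the parameter names (r0, K2, r1, K1) and the length-120 coefficient vector are hypothetical stand-ins:

    using HybridVariationalInference, TransformVariables
    import ComponentArrays as CA

    θP = CA.ComponentVector(r0 = 0.3, K2 = 2.0)  # hypothetical global parameters
    θM = CA.ComponentVector(r1 = 0.5, K1 = 0.2)  # template of one site's ML-predicted parameters
    ϕg = zeros(Float32, 120)                     # stand-in for the ML coefficient vector
    n_batch = 10

    (; ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs) =
        init_hybrid_params(θP, θM, ϕg, n_batch; transP = asℝ₊, transM = asℝ₊)

    # transformation and interpreter for predictions across, e.g., 100 sites
    trans_all = get_transPMs(100)
    int_all = get_ca_int_PMs(100)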