From 84925207fa4cccfe6a5ecae335e89bf6aac73c4d Mon Sep 17 00:00:00 2001
From: Thomas Wutzler
Date: Wed, 15 Jan 2025 14:56:14 +0100
Subject: [PATCH] adapt ELBO to updated derivation with entropy over standard
 Normal but log_det of the multiplication with the Cholesky factor.

---
 Project.toml                    |  2 +-
 dev/doubleMM.jl                 | 24 +++++++++++++++---------
 src/elbo.jl                     | 33 +++++++++++++++++++-------------
 test/test_cholesky_structure.jl |  2 +-
 test/test_elbo.jl               |  4 ++--
 test/test_logden_normal.jl      |  2 +-
 test/test_sample_zeta.jl        |  6 +++---
 7 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/Project.toml b/Project.toml
index 0c86f31..9d93b20 100644
--- a/Project.toml
+++ b/Project.toml
@@ -33,7 +33,7 @@ CUDA = "5.5.2"
 ChainRulesCore = "1.25"
 Combinatorics = "1.0.2"
 ComponentArrays = "0.15.19"
-Flux = "v0.15.2"
+Flux = "v0.15.2, 0.16"
 GPUArraysCore = "0.1, 0.2"
 LinearAlgebra = "1.10.0"
 Lux = "1.4.2"
diff --git a/dev/doubleMM.jl b/dev/doubleMM.jl
index a81d437..9061e27 100644
--- a/dev/doubleMM.jl
+++ b/dev/doubleMM.jl
@@ -186,7 +186,7 @@ end
     #res = Optimization.solve(optprob, Adam(0.02), callback=callback_loss(50), maxiters=1_400);
 end

-ϕ = ϕ_true |> Flux.gpu;
+ϕ = ϕ_ini |> Flux.gpu;
 xM_gpu = xM |> Flux.gpu;

 g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario);
@@ -213,14 +213,15 @@ g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario
     g_flux = g_luxs
 end

-function fcost(ϕ)
-    neg_elbo_transnorm_gf(rng, g_flux, f, CA.getdata(ϕ), y_o[:, 1:n_batch],
-        xM_gpu[:, 1:n_batch], transPMs_batch, map(get_concrete, interpreters);
+function fcost(ϕ, xM, y_o)
+    neg_elbo_transnorm_gf(rng, g_flux, f, CA.getdata(ϕ), y_o,
+        xM, transPMs_batch, map(get_concrete, interpreters);
         n_MC = 8, logσ2y = logσ2y)
 end
-fcost(ϕ)
+fcost(ϕ, xM_gpu[:, 1:n_batch], y_o[:, 1:n_batch])
 #Zygote.gradient(fcost, ϕ) |> cpu;
-gr = Zygote.gradient(fcost, CA.getdata(ϕ));
+gr = Zygote.gradient(fcost,
+    CA.getdata(ϕ), CA.getdata(xM_gpu[:, 1:n_batch]), CA.getdata(y_o[:, 1:n_batch]));
 gr_c = CA.ComponentArray(gr[1] |> Flux.cpu, CA.getaxes(ϕ)...)

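# --- Editor's aside (illustrative sketch, not part of the patch) ---
# The reworked ELBO relies on this identity: for ζ = μ .+ Uσ' * u with
# u ~ N(0, I) and upper-triangular Cholesky factor Uσ, the entropy of
# N(μ, Σ) with Σ = Uσ' * Uσ splits into the entropy of the standard Normal
# plus sum(log.(σ)) with σ = diag(Uσ) (the `logdet_jacT2` term in elbo.jl),
# so the constant part can be passed as `entropyN` (or dropped, its default 0.0).
# A minimal check, assuming only LinearAlgebra and Distributions; the
# `_demo` names are hypothetical, not from the package:
using Distributions, LinearAlgebra
Uσ_demo = [2.0 0.3; 0.0 1.5]           # upper-triangular Cholesky factor
Σ_demo = Symmetric(Uσ_demo' * Uσ_demo) # covariance of ζ = μ .+ Uσ_demo' * u
σ_demo = diag(Uσ_demo)                 # diagonal of the Cholesky factor
k = size(Uσ_demo, 1)
entropyN_demo = k / 2 * (1 + log(2π))  # entropy of the k-dim standard Normal
entropy(MvNormal(zeros(k), Σ_demo)) ≈ entropyN_demo + sum(log.(σ_demo)) # true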
 train_loader = MLUtils.DataLoader((xM_gpu, y_o), batchsize = n_batch)
@@ -228,7 +229,7 @@ train_loader = MLUtils.DataLoader((xM_gpu, y_o), batchsize = n_batch)
 optf = Optimization.OptimizationFunction(
     (ϕ, data) -> begin
         xM, y_o = data
-        fcost(ϕ)
+        fcost(ϕ, xM, y_o)
         # neg_elbo_transnorm_gf(
         #     rng, g_flux, f, ϕ, y_o, xM, transPMs_batch,
         #     map(get_concrete, interpreters); n_MC = 5, logσ2y)
@@ -259,22 +260,27 @@ hcat(ϕ_ini.unc, ϕunc_VI)
 # need to compare to MC sample
 # hard to estimate for original very small thetas but otherwise good

 # test predicting correct obs-uncertainty of predictive posterior
-# TODO reuse g_flux rather than g
 n_sample_pred = 200
+
 y_pred = predict_gf(rng, g_flux, f, res.u, xM_gpu, interpreters;
     get_transPMs, get_ca_int_PMs, n_sample_pred);
 size(y_pred) # n_obs x n_site, n_sample_pred

-σ_o_post = dropdims(std(y_pred; dims = 3), dims=3)
+σ_o_post = dropdims(std(y_pred; dims = 3), dims = 3);
 #describe(σ_o_post)
 hcat(σ_o, fill(mean_σ_o_MC, length(σ_o)),
     mean(σ_o_post, dims = 2), sqrt.(mean(abs2, σ_o_post, dims = 2)))
+# VI-predicted uncertainty is smaller than the HMC-predicted one

 mean_y_pred = map(mean, eachslice(y_pred; dims = (1, 2)))
 #describe(mean_y_pred - y_o)
 histogram(vec(mean_y_pred - y_true)) # predictions centered around y_o (or y_true)

 # look at θP, θM1 of first site
+intm_PMs_gen = get_ca_int_PMs(n_site)
+ζs, _σ = HVI.generate_ζ(rng, g_flux, f, res.u, xM_gpu,
+    (; interpreters..., PMs = intm_PMs_gen); n_MC = n_sample_pred);
+ζs = ζs |> Flux.cpu;
 θPM = vcat(θP_true, θMs_true[:, 1])
 intm = ComponentArrayInterpreter(θPM, (n_sample_pred,))
 ζs1c = intm(ζs[1:length(θPM), :])
diff --git a/src/elbo.jl b/src/elbo.jl
index d7ed51d..b1ba9af 100644
--- a/src/elbo.jl
+++ b/src/elbo.jl
@@ -20,8 +20,10 @@ expected value of the likelihood of observations.

""" function neg_elbo_transnorm_gf(rng, g, f, ϕ::AbstractVector, y_ob, x::AbstractMatrix, transPMs, interpreters::NamedTuple; - n_MC=3, logσ2y, gpu_data_handler = get_default_GPUHandler()) - ζs, logdetΣ = generate_ζ(rng, g, f, ϕ, x, interpreters; n_MC) + n_MC=3, logσ2y, gpu_data_handler = get_default_GPUHandler(), + entropyN = 0.0, + ) + ζs, σ = generate_ζ(rng, g, f, ϕ, x, interpreters; n_MC) ζs_cpu = gpu_data_handler(ζs) # differentiable fetch to CPU in Flux package extension #ζi = first(eachcol(ζs_cpu)) nLy = reduce(+, map(eachcol(ζs_cpu)) do ζi @@ -29,8 +31,11 @@ function neg_elbo_transnorm_gf(rng, g, f, ϕ::AbstractVector, y_ob, x::AbstractM nLy1 = neg_logden_indep_normal(y_ob, y_pred_i, logσ2y) nLy1 - logjac end) / n_MC - ent = entropy_MvNormal(size(ζs, 1), logdetΣ) # defined in logden_normal - nLy - ent + logdet_jacT2 = sum_log_σ = sum(log.(σ)) + # logdet_jacT2 = -sum_log_σ # log Prod(1/σ_i) = -sum log σ_i + # logdetΣ = 2 * sum_log_σ # log Prod(σ_i²) = 2* sum log σ_i + # ent = entropy_MvNormal(size(ζs, 1), logdetΣ) # defined in logden_normal + nLy - logdet_jacT2 - entropyN end """ @@ -45,17 +50,17 @@ function predict_gf(rng, g, f, ϕ::AbstractVector, xM::AbstractMatrix, interpret gpu_data_handler=get_default_GPUHandler()) n_site = size(xM, 2) intm_PMs_gen = get_ca_int_PMs(n_site) - tans_PMs_gen = get_transPMs(n_site) + trans_PMs_gen = get_transPMs(n_site) ζs, _ = generate_ζ(rng, g, f, CA.getdata(ϕ), CA.getdata(xM), (; interpreters..., PMs = intm_PMs_gen); n_MC = n_sample_pred) ζs_cpu = gpu_data_handler(ζs) # - y_pred = stack(map(ζ -> first(predict_y(ζ, f, tans_PMs_gen)), eachcol(ζs_cpu))); + y_pred = stack(map(ζ -> first(predict_y(ζ, f, trans_PMs_gen)), eachcol(ζs_cpu))); y_pred end """ -Generate samples of (inv-transformed) model parameters, ζ, and Log-Determinant -of their distribution. +Generate samples of (inv-transformed) model parameters, ζ, +and the vector of standard deviations, σ, i.e. the diagonal of the cholesky-factor. Adds the MV-normally distributed residuals, retrieved by `sample_ζ_norm0` to the means extracted from parameters and predicted by the machine learning @@ -68,8 +73,8 @@ function generate_ζ(rng, g, f, ϕ::AbstractVector, x::AbstractMatrix, μ_ζP = ϕc.μP ϕg = ϕc.ϕg μ_ζMs0 = g(x, ϕg) # TODO provide μ_ζP to g - ζ_resid, logdetΣ = sample_ζ_norm0(rng, μ_ζP, μ_ζMs0, ϕc.unc; n_MC) - #ζ_resid, logdetΣ = sample_ζ_norm0(rng, ϕ[1:2], reshape(ϕ[2 .+ (1:20)],2,:), ϕ[(end-length(interpreters.unc)+1):end], interpreters.unc; n_MC) + ζ_resid, σ = sample_ζ_norm0(rng, μ_ζP, μ_ζMs0, ϕc.unc; n_MC) + #ζ_resid, σ = sample_ζ_norm0(rng, ϕ[1:2], reshape(ϕ[2 .+ (1:20)],2,:), ϕ[(end-length(interpreters.unc)+1):end], interpreters.unc; n_MC) ζ = stack(map(eachcol(ζ_resid)) do r rc = interpreters.PMs(r) ζP = μ_ζP .+ rc.θP @@ -77,12 +82,12 @@ function generate_ζ(rng, g, f, ϕ::AbstractVector, x::AbstractMatrix, ζMs = μ_ζMs .+ rc.θMs vcat(ζP, vec(ζMs)) end) - ζ, logdetΣ + ζ, σ end """ Extract relevant parameters from θ and return n_MC generated draws -together with the logdet of the transformation. +together with the vector of standard deviations, σ. 
 Necessary type-stable information on the number of components
 is provided with ComponentMarshellers
@@ -115,9 +120,9 @@ function sample_ζ_norm0(urand::AbstractMatrix, ζP::AbstractVector{T}, ζMs::Ab
     # need to construct full matrix for CUDA
     Uσ = _create_blockdiag(UP, UM, σP, σMs, n_batch)
     ζ_resid = Uσ' * urand
-    logdetΣ = 2 .* sum(log.(diag(Uσ)))
+    σ = diag(Uσ) # elements of the diagonal: standard deviations
     # returns CuArrays to either continue on GPU or transfer to CPU
-    ζ_resid, logdetΣ
+    ζ_resid, σ
 end

 function _create_blockdiag(UP::AbstractMatrix{T}, UM, σP, σMs, n_batch) where {T}
diff --git a/test/test_cholesky_structure.jl b/test/test_cholesky_structure.jl
index 6566716..58a8624 100644
--- a/test/test_cholesky_structure.jl
+++ b/test/test_cholesky_structure.jl
@@ -241,7 +241,7 @@ end
     optprob = Optimization.OptimizationProblem(optf, Us1vec0)
     res = Optimization.solve(optprob, OptimizationOptimisers.Adam(0.02),
         #callback=callback_loss(50),
-        maxiters = 800)
+        maxiters = 1_000)

     Upred = CP.transformU_cholesky1(res.u; n = n_U)
     #@test Upred ≈ CU
diff --git a/test/test_elbo.jl b/test/test_elbo.jl
index bcc3f7a..cbe2199 100644
--- a/test/test_elbo.jl
+++ b/test/test_elbo.jl
@@ -98,7 +98,7 @@ n_MC = 3
 end

 @testset "generate_ζ" begin
-    ζ, logdetΣ = CP.generate_ζ(
+    ζ, σ = CP.generate_ζ(
         rng, g, f, ϕ_ini, xM[:, 1:n_batch],
         map(get_concrete, interpreters); n_MC = 8)
     @test ζ isa Matrix
@@ -119,7 +119,7 @@ if CUDA.functional()
     @testset "generate_ζ gpu" begin
         ϕ = CuArray(CA.getdata(ϕ_ini))
         xMg_batch = CuArray(xM[:, 1:n_batch])
-        ζ, logdetΣ = CP.generate_ζ(
+        ζ, σ = CP.generate_ζ(
             rng, g_flux, f, ϕ, xMg_batch,
             map(get_concrete, interpreters); n_MC = 8)
         @test ζ isa CuMatrix
diff --git a/test/test_logden_normal.jl b/test/test_logden_normal.jl
index d1be1dc..a27fdc0 100644
--- a/test/test_logden_normal.jl
+++ b/test/test_logden_normal.jl
@@ -33,7 +33,7 @@ end;

 @testset "entropy_MvNormal" begin
     S = Diagonal([4,5]) .+ rand(2,2)
     S2 = Symmetric(S*S)
-    @test entropy_MvNormal(S2) == entropy(MvNormal(S2))
+    @test entropy_MvNormal(S2) ≈ entropy(MvNormal(S2))
 end;
diff --git a/test/test_sample_zeta.jl b/test/test_sample_zeta.jl
index b27d0b0..3befbc9 100644
--- a/test/test_sample_zeta.jl
+++ b/test/test_sample_zeta.jl
@@ -59,7 +59,7 @@ scenario = (:default,)
 @testset "sample_ζ_norm0 cpu" begin
     ϕ = CA.getdata(ϕ_cpu)
     ϕc = interpreters.pmu(ϕ)
-    ζ_resid, logdetΣ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
+    ζ_resid, σ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
     @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC)
     gr = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc)[1]), ϕc)[1]
     @test length(gr) == length(ϕ)
@@ -76,9 +76,9 @@ scenario = (:default,)
     #ζP, ζMs, ϕunc = ϕc.P, ϕc.Ms, ϕc.unc
     #urand = CUDA.randn(length(ϕc.P) + length(ϕc.Ms), n_MC) |> gpu
     #include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss
-    #ζ_resid, logdetΣ = sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
+    #ζ_resid, σ = sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
     #Zygote.gradient(ϕc -> sum(sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)[1]), ϕc)[1];
-    ζ_resid, logdetΣ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
+    ζ_resid, σ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
     @test ζ_resid isa GPUArraysCore.AbstractGPUArray
     @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC)
     gr = Zygote.gradient(