From 84925207fa4cccfe6a5ecae335e89bf6aac73c4d Mon Sep 17 00:00:00 2001
From: Thomas Wutzler
Date: Wed, 15 Jan 2025 14:56:14 +0100
Subject: [PATCH] adapt ELBO to updated derivation with entropy over standard
 Normal but log_det of the multiplication with the Cholesky factor.

---
 Project.toml                    |  2 +-
 dev/doubleMM.jl                 | 24 +++++++++++++++---------
 src/elbo.jl                     | 33 +++++++++++++++++++-------------
 test/test_cholesky_structure.jl |  2 +-
 test/test_elbo.jl               |  4 ++--
 test/test_logden_normal.jl      |  2 +-
 test/test_sample_zeta.jl        |  6 +++---
 7 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/Project.toml b/Project.toml
index 0c86f31..9d93b20 100644
--- a/Project.toml
+++ b/Project.toml
@@ -33,7 +33,7 @@ CUDA = "5.5.2"
 ChainRulesCore = "1.25"
 Combinatorics = "1.0.2"
 ComponentArrays = "0.15.19"
-Flux = "v0.15.2"
+Flux = "v0.15.2, 0.16"
 GPUArraysCore = "0.1, 0.2"
 LinearAlgebra = "1.10.0"
 Lux = "1.4.2"
diff --git a/dev/doubleMM.jl b/dev/doubleMM.jl
index a81d437..9061e27 100644
--- a/dev/doubleMM.jl
+++ b/dev/doubleMM.jl
@@ -186,7 +186,7 @@ end
     #res = Optimization.solve(optprob, Adam(0.02), callback=callback_loss(50), maxiters=1_400);
 end

-ϕ = ϕ_true |> Flux.gpu;
+ϕ = ϕ_ini |> Flux.gpu;
 xM_gpu = xM |> Flux.gpu;

 g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario);
@@ -213,14 +213,15 @@ g_flux, ϕg0_flux_cpu = gen_hybridcase_MLapplicator(case, FluxMLengine; scenario
     g_flux = g_luxs
 end

-function fcost(ϕ)
-    neg_elbo_transnorm_gf(rng, g_flux, f, CA.getdata(ϕ), y_o[:, 1:n_batch],
-        xM_gpu[:, 1:n_batch], transPMs_batch, map(get_concrete, interpreters);
+function fcost(ϕ, xM, y_o)
+    neg_elbo_transnorm_gf(rng, g_flux, f, CA.getdata(ϕ), y_o,
+        xM, transPMs_batch, map(get_concrete, interpreters);
         n_MC = 8, logσ2y = logσ2y)
 end
-fcost(ϕ)
+fcost(ϕ, xM_gpu[:, 1:n_batch], y_o[:, 1:n_batch])
 #Zygote.gradient(fcost, ϕ) |> cpu;
-gr = Zygote.gradient(fcost, CA.getdata(ϕ));
+gr = Zygote.gradient(fcost,
+    CA.getdata(ϕ), CA.getdata(xM_gpu[:, 1:n_batch]), CA.getdata(y_o[:, 1:n_batch]));
 gr_c = CA.ComponentArray(gr[1] |> Flux.cpu, CA.getaxes(ϕ)...)

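# --- Editor's aside (illustrative sketch, not part of the patch) ---
# The reworked ELBO relies on this identity: for ζ = μ .+ Uσ' * u with
# u ~ N(0, I) and upper-triangular Cholesky factor Uσ, the entropy of
# N(μ, Σ) with Σ = Uσ' * Uσ splits into the entropy of the standard Normal
# plus sum(log.(σ)) with σ = diag(Uσ) (the `logdet_jacT2` term in elbo.jl),
# so the constant part can be passed as `entropyN` (or dropped, its default 0.0).
# A minimal check, assuming only LinearAlgebra and Distributions; the
# `_demo` names are hypothetical, not from the package:
using Distributions, LinearAlgebra
Uσ_demo = [2.0 0.3; 0.0 1.5]           # upper-triangular Cholesky factor
Σ_demo = Symmetric(Uσ_demo' * Uσ_demo) # covariance of ζ = μ .+ Uσ_demo' * u
σ_demo = diag(Uσ_demo)                 # diagonal of the Cholesky factor
k = size(Uσ_demo, 1)
entropyN_demo = k / 2 * (1 + log(2π))  # entropy of the k-dim standard Normal
entropy(MvNormal(zeros(k), Σ_demo)) ≈ entropyN_demo + sum(log.(σ_demo)) # true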
 train_loader = MLUtils.DataLoader((xM_gpu, y_o), batchsize = n_batch)
@@ -228,7 +229,7 @@ train_loader = MLUtils.DataLoader((xM_gpu, y_o), batchsize = n_batch)
 optf = Optimization.OptimizationFunction(
     (ϕ, data) -> begin
         xM, y_o = data
-        fcost(ϕ)
+        fcost(ϕ, xM, y_o)
         # neg_elbo_transnorm_gf(
         #     rng, g_flux, f, ϕ, y_o, xM, transPMs_batch,
         #     map(get_concrete, interpreters); n_MC = 5, logσ2y)
@@ -259,22 +260,27 @@ hcat(ϕ_ini.unc, ϕunc_VI)
 # need to compare to MC sample
 # hard to estimate for original very small thetas but otherwise good

 # test predicting correct obs-uncertainty of predictive posterior
-# TODO reuse g_flux rather than g
 n_sample_pred = 200
+
 y_pred = predict_gf(rng, g_flux, f, res.u, xM_gpu, interpreters;
     get_transPMs, get_ca_int_PMs, n_sample_pred);
 size(y_pred) # n_obs x n_site, n_sample_pred

-σ_o_post = dropdims(std(y_pred; dims = 3), dims=3)
+σ_o_post = dropdims(std(y_pred; dims = 3), dims = 3);
 #describe(σ_o_post)
 hcat(σ_o, fill(mean_σ_o_MC, length(σ_o)),
     mean(σ_o_post, dims = 2), sqrt.(mean(abs2, σ_o_post, dims = 2)))
+# VI-predicted uncertainty is smaller than the HMC-predicted one

 mean_y_pred = map(mean, eachslice(y_pred; dims = (1, 2)))
 #describe(mean_y_pred - y_o)
 histogram(vec(mean_y_pred - y_true)) # predictions centered around y_o (or y_true)

 # look at θP, θM1 of first site
+intm_PMs_gen = get_ca_int_PMs(n_site)
+ζs, _σ = HVI.generate_ζ(rng, g_flux, f, res.u, xM_gpu,
+    (; interpreters..., PMs = intm_PMs_gen); n_MC = n_sample_pred);
+ζs = ζs |> Flux.cpu;
 θPM = vcat(θP_true, θMs_true[:, 1])
 intm = ComponentArrayInterpreter(θPM, (n_sample_pred,))
 ζs1c = intm(ζs[1:length(θPM), :])
diff --git a/src/elbo.jl b/src/elbo.jl
index d7ed51d..b1ba9af 100644
--- a/src/elbo.jl
+++ b/src/elbo.jl
@@ -20,8 +20,10 @@ expected value of the likelihood of observations.

""" function neg_elbo_transnorm_gf(rng, g, f, ϕ::AbstractVector, y_ob, x::AbstractMatrix, transPMs, interpreters::NamedTuple; - n_MC=3, logσ2y, gpu_data_handler = get_default_GPUHandler()) - ζs, logdetΣ = generate_ζ(rng, g, f, ϕ, x, interpreters; n_MC) + n_MC=3, logσ2y, gpu_data_handler = get_default_GPUHandler(), + entropyN = 0.0, + ) + ζs, σ = generate_ζ(rng, g, f, ϕ, x, interpreters; n_MC) ζs_cpu = gpu_data_handler(ζs) # differentiable fetch to CPU in Flux package extension #ζi = first(eachcol(ζs_cpu)) nLy = reduce(+, map(eachcol(ζs_cpu)) do ζi @@ -29,8 +31,11 @@ function neg_elbo_transnorm_gf(rng, g, f, ϕ::AbstractVector, y_ob, x::AbstractM nLy1 = neg_logden_indep_normal(y_ob, y_pred_i, logσ2y) nLy1 - logjac end) / n_MC - ent = entropy_MvNormal(size(ζs, 1), logdetΣ) # defined in logden_normal - nLy - ent + logdet_jacT2 = sum_log_σ = sum(log.(σ)) + # logdet_jacT2 = -sum_log_σ # log Prod(1/σ_i) = -sum log σ_i + # logdetΣ = 2 * sum_log_σ # log Prod(σ_i²) = 2* sum log σ_i + # ent = entropy_MvNormal(size(ζs, 1), logdetΣ) # defined in logden_normal + nLy - logdet_jacT2 - entropyN end """ @@ -45,17 +50,17 @@ function predict_gf(rng, g, f, ϕ::AbstractVector, xM::AbstractMatrix, interpret gpu_data_handler=get_default_GPUHandler()) n_site = size(xM, 2) intm_PMs_gen = get_ca_int_PMs(n_site) - tans_PMs_gen = get_transPMs(n_site) + trans_PMs_gen = get_transPMs(n_site) ζs, _ = generate_ζ(rng, g, f, CA.getdata(ϕ), CA.getdata(xM), (; interpreters..., PMs = intm_PMs_gen); n_MC = n_sample_pred) ζs_cpu = gpu_data_handler(ζs) # - y_pred = stack(map(ζ -> first(predict_y(ζ, f, tans_PMs_gen)), eachcol(ζs_cpu))); + y_pred = stack(map(ζ -> first(predict_y(ζ, f, trans_PMs_gen)), eachcol(ζs_cpu))); y_pred end """ -Generate samples of (inv-transformed) model parameters, ζ, and Log-Determinant -of their distribution. +Generate samples of (inv-transformed) model parameters, ζ, +and the vector of standard deviations, σ, i.e. the diagonal of the cholesky-factor. Adds the MV-normally distributed residuals, retrieved by `sample_ζ_norm0` to the means extracted from parameters and predicted by the machine learning @@ -68,8 +73,8 @@ function generate_ζ(rng, g, f, ϕ::AbstractVector, x::AbstractMatrix, μ_ζP = ϕc.μP ϕg = ϕc.ϕg μ_ζMs0 = g(x, ϕg) # TODO provide μ_ζP to g - ζ_resid, logdetΣ = sample_ζ_norm0(rng, μ_ζP, μ_ζMs0, ϕc.unc; n_MC) - #ζ_resid, logdetΣ = sample_ζ_norm0(rng, ϕ[1:2], reshape(ϕ[2 .+ (1:20)],2,:), ϕ[(end-length(interpreters.unc)+1):end], interpreters.unc; n_MC) + ζ_resid, σ = sample_ζ_norm0(rng, μ_ζP, μ_ζMs0, ϕc.unc; n_MC) + #ζ_resid, σ = sample_ζ_norm0(rng, ϕ[1:2], reshape(ϕ[2 .+ (1:20)],2,:), ϕ[(end-length(interpreters.unc)+1):end], interpreters.unc; n_MC) ζ = stack(map(eachcol(ζ_resid)) do r rc = interpreters.PMs(r) ζP = μ_ζP .+ rc.θP @@ -77,12 +82,12 @@ function generate_ζ(rng, g, f, ϕ::AbstractVector, x::AbstractMatrix, ζMs = μ_ζMs .+ rc.θMs vcat(ζP, vec(ζMs)) end) - ζ, logdetΣ + ζ, σ end """ Extract relevant parameters from θ and return n_MC generated draws -together with the logdet of the transformation. +together with the vector of standard deviations, σ. 
 Necessary type-stable information on the number of components
 is provided with ComponentMarshellers
@@ -115,9 +120,9 @@ function sample_ζ_norm0(urand::AbstractMatrix, ζP::AbstractVector{T}, ζMs::Ab
     # need to construct full matrix for CUDA
     Uσ = _create_blockdiag(UP, UM, σP, σMs, n_batch)
     ζ_resid = Uσ' * urand
-    logdetΣ = 2 .* sum(log.(diag(Uσ)))
+    σ = diag(Uσ) # elements of the diagonal: standard deviations
     # returns CuArrays to either continue on GPU or transfer to CPU
-    ζ_resid, logdetΣ
+    ζ_resid, σ
 end

 function _create_blockdiag(UP::AbstractMatrix{T}, UM, σP, σMs, n_batch) where {T}
diff --git a/test/test_cholesky_structure.jl b/test/test_cholesky_structure.jl
index 6566716..58a8624 100644
--- a/test/test_cholesky_structure.jl
+++ b/test/test_cholesky_structure.jl
@@ -241,7 +241,7 @@ end
     optprob = Optimization.OptimizationProblem(optf, Us1vec0)
     res = Optimization.solve(optprob, OptimizationOptimisers.Adam(0.02),
         #callback=callback_loss(50),
-        maxiters = 800)
+        maxiters = 1_000)

     Upred = CP.transformU_cholesky1(res.u; n = n_U)
     #@test Upred ≈ CU
diff --git a/test/test_elbo.jl b/test/test_elbo.jl
index bcc3f7a..cbe2199 100644
--- a/test/test_elbo.jl
+++ b/test/test_elbo.jl
@@ -98,7 +98,7 @@ n_MC = 3
 end

 @testset "generate_ζ" begin
-    ζ, logdetΣ = CP.generate_ζ(
+    ζ, σ = CP.generate_ζ(
         rng, g, f, ϕ_ini, xM[:, 1:n_batch],
         map(get_concrete, interpreters); n_MC = 8)
     @test ζ isa Matrix
@@ -119,7 +119,7 @@ if CUDA.functional()
     @testset "generate_ζ gpu" begin
         ϕ = CuArray(CA.getdata(ϕ_ini))
         xMg_batch = CuArray(xM[:, 1:n_batch])
-        ζ, logdetΣ = CP.generate_ζ(
+        ζ, σ = CP.generate_ζ(
             rng, g_flux, f, ϕ, xMg_batch,
             map(get_concrete, interpreters); n_MC = 8)
         @test ζ isa CuMatrix
diff --git a/test/test_logden_normal.jl b/test/test_logden_normal.jl
index d1be1dc..a27fdc0 100644
--- a/test/test_logden_normal.jl
+++ b/test/test_logden_normal.jl
@@ -33,7 +33,7 @@ end;

 @testset "entropy_MvNormal" begin
     S = Diagonal([4,5]) .+ rand(2,2)
     S2 = Symmetric(S*S)
-    @test entropy_MvNormal(S2) == entropy(MvNormal(S2))
+    @test entropy_MvNormal(S2) ≈ entropy(MvNormal(S2))
 end;
diff --git a/test/test_sample_zeta.jl b/test/test_sample_zeta.jl
index b27d0b0..3befbc9 100644
--- a/test/test_sample_zeta.jl
+++ b/test/test_sample_zeta.jl
@@ -59,7 +59,7 @@ scenario = (:default,)
 @testset "sample_ζ_norm0 cpu" begin
     ϕ = CA.getdata(ϕ_cpu)
     ϕc = interpreters.pmu(ϕ)
-    ζ_resid, logdetΣ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
+    ζ_resid, σ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
     @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC)
     gr = Zygote.gradient(ϕc -> sum(CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc)[1]), ϕc)[1]
     @test length(gr) == length(ϕ)
@@ -76,9 +76,9 @@ scenario = (:default,)
     #ζP, ζMs, ϕunc = ϕc.P, ϕc.Ms, ϕc.unc
     #urand = CUDA.randn(length(ϕc.P) + length(ϕc.Ms), n_MC) |> gpu
     #include(joinpath(@__DIR__, "uncNN", "elbo.jl")) # callback_loss
-    #ζ_resid, logdetΣ = sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
+    #ζ_resid, σ = sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
     #Zygote.gradient(ϕc -> sum(sample_ζ_norm0(urand, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)[1]), ϕc)[1];
-    ζ_resid, logdetΣ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
+    ζ_resid, σ = CP.sample_ζ_norm0(rng, ϕc.P, ϕc.Ms, ϕc.unc; n_MC)
     @test ζ_resid isa GPUArraysCore.AbstractGPUArray
     @test size(ζ_resid) == (length(ϕc.P) + n_θM * n_site, n_MC)
     gr = Zygote.gradient(