Add Wolfe line search to Laplace approximation #3250
base: develop
@@ -8,11 +8,49 @@
```cpp
namespace stan {
namespace math {

namespace internal {
/**
 * Set all adjoints of the output to zero.
 */
template <typename Output>
inline void set_zero_adjoint(Output&& output) {
  if constexpr (is_all_arithmetic_scalar_v<Output>) {
    return;
  } else {
    return iter_tuple_nested(
        [](auto&& output_i) {
          using output_i_t = std::decay_t<decltype(output_i)>;
          if constexpr (is_all_arithmetic_scalar_v<output_i_t>) {
            return;
          } else if constexpr (is_std_vector<output_i_t>::value) {
            for (Eigen::Index i = 0; i < output_i.size(); ++i) {
              output_i[i].adj() = 0;
            }
          } else if constexpr (is_eigen_v<output_i_t>) {
            output_i.adj().setZero();
          } else if constexpr (is_stan_scalar_v<output_i_t>) {
            output_i.adj() = 0;
          } else {
            static_assert(
                sizeof(std::decay_t<output_i_t>*) == 0,
                "INTERNAL ERROR:(laplace_marginal_lpdf) set_zero_adjoints was "
                "not able to deduce the actions needed for the given type. "
                "This is an internal error, please report it: "
                "https://github.com/stan-dev/math/issues");
          }
        },
        std::forward<Output>(output));
  }
}

}  // namespace internal

/**
 * Functions to compute the log density and its first, second,
 * and third-order derivatives for a likelihood specified by the user.
 */
namespace laplace_likelihood {

namespace internal {
/**
 * @tparam F A functor with `operator()(Args&&...)` returning a scalar
```
@@ -106,6 +144,126 @@ inline auto shallow_copy_vargs(Args&&... args) {
```cpp
      std::forward<Args>(args)...);
}

/**
 * Computes the gradient of `f` wrt `theta`.
 * @note If `Args` contains \ref var types then their adjoints will be
 * calculated as a side effect.
 * @tparam F A functor with `operator()(Args&&...)` returning a scalar
 * @tparam Theta A class assignable to an Eigen vector type
 * @tparam Stream Type of stream for messages.
 * @tparam Args Type of variadic arguments.
 * @param f Log likelihood function.
 * @param theta Latent Gaussian variables.
 * @param msgs Stream for messages.
 * @param args Variadic arguments for the likelihood function.
 */
template <typename F, typename Theta, typename Stream, typename... Args,
          require_eigen_vector_vt<std::is_arithmetic, Theta>* = nullptr>
inline auto theta_grad(F&& f, Theta&& theta, Stream* msgs, Args&&... args) {
  using Eigen::Dynamic;
  using Eigen::Matrix;
  nested_rev_autodiff nested;
  Matrix<var, Dynamic, 1> theta_var = theta;
  var f_var = f(theta_var, args..., msgs);
  grad(f_var.vi_);
  return theta_var.adj().eval();
}
```
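For orientation, here is a minimal usage sketch of `theta_grad`'s calling convention (not part of the diff): `f` receives `(theta, args..., msgs)` and returns a scalar. The Poisson-style likelihood and data are made up, and the sketch assumes the internal overload is reachable via the mixed-mode header as in this file.

```cpp
#include <stan/math/mix.hpp>
#include <Eigen/Dense>

int main() {
  // f takes (theta, args..., msgs) and returns a scalar log likelihood.
  auto f = [](const auto& theta, const auto& y, std::ostream*) {
    // sum_i (y_i * theta_i - exp(theta_i)): Poisson log likelihood with
    // log-mean theta, constants dropped.
    return stan::math::dot_product(y, theta)
           - stan::math::sum(stan::math::exp(theta));
  };
  Eigen::VectorXd theta(2), y(2);
  theta << 0.5, -0.3;
  y << 1.0, 0.0;
  // Gradient wrt theta; y is arithmetic, so no adjoints are touched.
  Eigen::VectorXd g = stan::math::laplace_likelihood::internal::theta_grad(
      f, theta, static_cast<std::ostream*>(nullptr), y);
  // Expected entries: g(i) == y(i) - exp(theta(i)).
  return 0;
}
```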
Contributor: So is this only differentiating wrt `args` but not `theta`?

Author: Yes.
```cpp
/**
 * Computes the gradient of `f` wrt the likelihood arguments `args...`,
 * but not wrt `theta`.
 * @note If `Args` contains \ref var types then their adjoints will be
 * calculated as a side effect.
 * @tparam F A functor with `operator()(Args&&...)` returning a scalar
 * @tparam Theta A class assignable to an Eigen vector type
 * @tparam Stream Type of stream for messages.
 * @tparam Args Type of variadic arguments.
 * @param f Log likelihood function.
 * @param theta Latent Gaussian variables.
 * @param msgs Stream for messages.
 * @param args Variadic arguments for the likelihood function.
 */
template <typename F, typename Theta, typename Stream, typename... Args,
          require_eigen_vector_vt<std::is_arithmetic, Theta>* = nullptr>
inline void ll_arg_grad(F&& f, Theta&& theta, Stream* msgs, Args&&... args) {
  using Eigen::Dynamic;
  using Eigen::Matrix;
  nested_rev_autodiff nested;
  var f_var = f(theta, args..., msgs);
  grad(f_var.vi_);
}
```
```cpp
/**
 * Computes the negative diagonal Hessian of `f` wrt `theta`, under the
 * assumption that the Hessian is diagonal.
 * @note If `Args` contains \ref var types then their adjoints will be
 * calculated as a side effect.
 * @tparam F A functor with `operator()(Args&&...)` returning a scalar
 * @tparam Theta A class assignable to an Eigen vector type
 * @tparam Stream Type of stream for messages.
 * @tparam Args Type of variadic arguments.
 * @param f Log likelihood function.
 * @param theta Latent Gaussian variables.
 * @param msgs Stream for messages.
 * @param args Variadic arguments for the likelihood function.
 */
template <typename F, typename Theta, typename Stream, typename... Args,
          require_eigen_vector_vt<std::is_arithmetic, Theta>* = nullptr>
inline auto diagonal_hessian(F&& f, Theta&& theta, Stream* msgs,
                             Args&&... args) {
  using Eigen::Dynamic;
  using Eigen::Matrix;
  const Eigen::Index theta_size = theta.size();
  auto v = Eigen::VectorXd::Ones(theta_size);
  Eigen::VectorXd hessian_v = Eigen::VectorXd::Zero(theta_size);
  hessian_times_vector(f, hessian_v, std::forward<Theta>(theta), std::move(v),
                       value_of(args)..., msgs);
  return (-hessian_v).eval();
}
```
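A one-line justification for the ones vector (not in the diff): a Hessian-vector product yields row sums, which coincide with the diagonal entries exactly under the diagonal-Hessian assumption this overload encodes:

$$(H\mathbf{1})_i = \sum_j H_{ij} = H_{ii} \qquad \text{when } H_{ij} = 0 \ \text{for all } i \neq j.$$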
Contributor: Change the description slightly to distinguish it from the previous function and indicate that one handles the diagonal case and one doesn't.
```cpp
/**
 * Computes the negative block diagonal Hessian of `f` wrt `theta`, where
 * the blocks may be larger than one (unlike `diagonal_hessian` above).
 * @note If `Args` contains \ref var types then their adjoints will be
 * calculated as a side effect.
 * @tparam F A functor with `operator()(Args&&...)` returning a scalar
 * @tparam Theta A class assignable to an Eigen vector type
 * @tparam Stream Type of stream for messages.
 * @tparam Args Type of variadic arguments.
 * @param f Log likelihood function.
 * @param theta Latent Gaussian variables.
 * @param hessian_block_size If the Hessian of the log likelihood function
 *                           w.r.t. the latent Gaussian variable is
 *                           block-diagonal, size of each block.
 * @param msgs Stream for messages.
 * @param args Variadic arguments for the likelihood function.
 */
template <typename F, typename Theta, typename Stream, typename... Args,
          require_eigen_vector_vt<std::is_arithmetic, Theta>* = nullptr>
inline auto block_hessian(F&& f, Theta&& theta,
                          const Eigen::Index hessian_block_size, Stream* msgs,
                          Args&&... args) {
  using Eigen::Dynamic;
  using Eigen::Matrix;
  const Eigen::Index theta_size = theta.size();
  if (hessian_block_size == 1) {
    auto v = Eigen::VectorXd::Ones(theta_size);
    Eigen::VectorXd hessian_v = Eigen::VectorXd::Zero(theta_size);
    hessian_times_vector(f, hessian_v, std::forward<Theta>(theta),
                         std::move(v), value_of(args)..., msgs);
    Eigen::SparseMatrix<double> hessian_theta(theta_size, theta_size);
    hessian_theta.reserve(Eigen::VectorXi::Constant(theta_size, 1));
    for (Eigen::Index i = 0; i < theta_size; i++) {
      hessian_theta.insert(i, i) = hessian_v(i);
    }
    return (-hessian_theta).eval();
  } else {
    return (-hessian_block_diag(f, std::forward<Theta>(theta),
                                hessian_block_size, value_of(args)..., msgs))
        .eval();
  }
}

/**
 * Computes the theta gradient and negative block diagonal Hessian of `f`
 * wrt `theta` and `args...`
```
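For intuition (illustration only, not in the diff): with a 4-dimensional `theta` and `hessian_block_size == 2`, the matrix returned above has the sparsity pattern

$$-H = \begin{pmatrix} \ast & \ast & 0 & 0 \\ \ast & \ast & 0 & 0 \\ 0 & 0 & \ast & \ast \\ 0 & 0 & \ast & \ast \end{pmatrix},$$

which is also why the `hessian_block_size == 1` branch can reserve exactly one nonzero per column of the sparse result.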
@@ -301,6 +459,79 @@ inline auto diff_eta_implicit(F&& f, V_t&& v, Theta&& theta, Stream* msgs,
```cpp

}  // namespace internal

/**
 * A wrapper that accepts a tuple as arguments.
 * @tparam F A functor with `operator()(Args&&...)` returning a scalar
 * @tparam Theta A class assignable to an Eigen vector type
 * @tparam TupleArgs Type of arguments for covariance function.
 * @tparam Stream Type of stream for messages.
 * @param f Log likelihood function.
 * @param theta Latent Gaussian variables.
 * @param ll_tup Arguments for likelihood function.
 * @param msgs Stream for messages.
 */
template <typename F, typename Theta, typename TupleArgs, typename Stream,
          require_eigen_vector_t<Theta>* = nullptr,
          require_tuple_t<TupleArgs>* = nullptr>
inline auto theta_grad(F&& f, Theta&& theta, TupleArgs&& ll_tup,
                       Stream* msgs = nullptr) {
  return apply(
      [](auto&& f, auto&& theta, auto&& msgs, auto&&... args) {
        return internal::theta_grad(std::forward<decltype(f)>(f),
                                    std::forward<decltype(theta)>(theta), msgs,
                                    std::forward<decltype(args)>(args)...);
      },
      std::forward<TupleArgs>(ll_tup), std::forward<F>(f),
      std::forward<Theta>(theta), msgs);
}

template <typename F, typename Theta, typename TupleArgs, typename Stream,
          require_eigen_vector_t<Theta>* = nullptr,
          require_tuple_t<TupleArgs>* = nullptr>
inline auto ll_arg_grad(F&& f, Theta&& theta, TupleArgs&& ll_tup,
                        Stream* msgs = nullptr) {
  return apply(
      [](auto&& f, auto&& theta, auto&& msgs, auto&&... args) {
        return internal::ll_arg_grad(std::forward<decltype(f)>(f),
                                     std::forward<decltype(theta)>(theta),
                                     msgs,
                                     std::forward<decltype(args)>(args)...);
      },
      std::forward<TupleArgs>(ll_tup), std::forward<F>(f),
      std::forward<Theta>(theta), msgs);
}

template <typename F, typename Theta, typename TupleArgs, typename Stream,
          require_eigen_vector_t<Theta>* = nullptr,
          require_tuple_t<TupleArgs>* = nullptr>
inline auto diagonal_hessian(F&& f, Theta&& theta, TupleArgs&& ll_tuple,
                             Stream* msgs) {
  return apply(
      [](auto&& f, auto&& theta, auto* msgs, auto&&... args) {
        return internal::diagonal_hessian(
            std::forward<decltype(f)>(f), std::forward<decltype(theta)>(theta),
            msgs, std::forward<decltype(args)>(args)...);
      },
      std::forward<TupleArgs>(ll_tuple), std::forward<F>(f),
      std::forward<Theta>(theta), msgs);
}

template <typename F, typename Theta, typename TupleArgs, typename Stream,
          require_eigen_vector_t<Theta>* = nullptr,
          require_tuple_t<TupleArgs>* = nullptr>
inline auto block_hessian(F&& f, Theta&& theta,
                          const Eigen::Index hessian_block_size,
                          TupleArgs&& ll_tuple, Stream* msgs) {
  return apply(
      [](auto&& f, auto&& theta, auto hessian_block_size, auto* msgs,
         auto&&... args) {
        return internal::block_hessian(
            std::forward<decltype(f)>(f), std::forward<decltype(theta)>(theta),
            hessian_block_size, msgs, std::forward<decltype(args)>(args)...);
      },
      std::forward<TupleArgs>(ll_tuple), std::forward<F>(f),
      std::forward<Theta>(theta), hessian_block_size, msgs);
}

/**
 * A wrapper that accepts a tuple as arguments.
 * @tparam F A functor with `operator()(Args&&...)` returning a scalar
```
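A hedged call-site sketch for the public tuple wrappers, with `f`, `theta`, and `y` as in the earlier `theta_grad` sketch: the likelihood arguments travel in a `std::tuple` that `apply` unpacks into the variadic internal overloads.

```cpp
#include <tuple>

// Equivalent to the earlier internal::theta_grad call, but with the
// likelihood arguments packed into a tuple.
Eigen::VectorXd g2 = stan::math::laplace_likelihood::theta_grad(
    f, theta, std::make_tuple(y), static_cast<std::ostream*>(nullptr));
```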
Contributor: If the gradients are not only with respect to `theta` (the latent Gaussian variables), we might want to change the name `theta gradients` here to simply `gradients of the log likelihood`.

Author: I'm going to add a note to clarify this. If a user passes in `args` that are reverse mode autodiff types, then this would compute the gradients wrt both `theta` and `args`. But if the user passes in `args` that are not autodiff, then we still compute the gradient wrt `theta`.

Author: The main point of the function is to compute `theta`'s gradients, which is why I made the name `theta_grad`. The `args` having their gradients calculated are more of a side effect.