diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_1.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_1.pdf index 5bbbdbe..1da86a5 100644 Binary files a/w06_hpo_bo/images/acq_func_images/ei/ei_1.pdf and b/w06_hpo_bo/images/acq_func_images/ei/ei_1.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_2.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_2.pdf deleted file mode 100644 index 9d34384..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/ei/ei_2.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_2a.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_2a.pdf new file mode 100644 index 0000000..7aeaea1 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/ei/ei_2a.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_2b.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_2b.pdf new file mode 100644 index 0000000..0ea8b13 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/ei/ei_2b.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_3.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_3.pdf index d2b0093..ed97447 100644 Binary files a/w06_hpo_bo/images/acq_func_images/ei/ei_3.pdf and b/w06_hpo_bo/images/acq_func_images/ei/ei_3.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_4.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_4.pdf index 23ecab4..71662d1 100644 Binary files a/w06_hpo_bo/images/acq_func_images/ei/ei_4.pdf and b/w06_hpo_bo/images/acq_func_images/ei/ei_4.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_5.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_5.pdf index 7d01096..b307e6e 100644 Binary files a/w06_hpo_bo/images/acq_func_images/ei/ei_5.pdf and b/w06_hpo_bo/images/acq_func_images/ei/ei_5.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_6.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_6.pdf index 795295b..edd43ff 100644 Binary files a/w06_hpo_bo/images/acq_func_images/ei/ei_6.pdf and b/w06_hpo_bo/images/acq_func_images/ei/ei_6.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_7.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_7.pdf index ab66ae6..e0ec68f 100644 Binary files a/w06_hpo_bo/images/acq_func_images/ei/ei_7.pdf and b/w06_hpo_bo/images/acq_func_images/ei/ei_7.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ei/ei_8.pdf b/w06_hpo_bo/images/acq_func_images/ei/ei_8.pdf new file mode 100644 index 0000000..37b5119 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/ei/ei_8.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/es/es_1.pdf b/w06_hpo_bo/images/acq_func_images/es/es_1.pdf index ff6251e..16f4b8d 100644 Binary files a/w06_hpo_bo/images/acq_func_images/es/es_1.pdf and b/w06_hpo_bo/images/acq_func_images/es/es_1.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/es/es_2.pdf b/w06_hpo_bo/images/acq_func_images/es/es_2.pdf index a1f8b70..5253bd1 100644 Binary files a/w06_hpo_bo/images/acq_func_images/es/es_2.pdf and b/w06_hpo_bo/images/acq_func_images/es/es_2.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/es/es_3.pdf b/w06_hpo_bo/images/acq_func_images/es/es_3.pdf index 46076fb..cd5a086 100644 Binary files a/w06_hpo_bo/images/acq_func_images/es/es_3.pdf and b/w06_hpo_bo/images/acq_func_images/es/es_3.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/es/es_4.pdf b/w06_hpo_bo/images/acq_func_images/es/es_4.pdf index ead6d76..4b8c070 100644 Binary files a/w06_hpo_bo/images/acq_func_images/es/es_4.pdf and b/w06_hpo_bo/images/acq_func_images/es/es_4.pdf differ diff --git 
a/w06_hpo_bo/images/acq_func_images/es/es_5.pdf b/w06_hpo_bo/images/acq_func_images/es/es_5.pdf index bb6a70a..ec45d30 100644 Binary files a/w06_hpo_bo/images/acq_func_images/es/es_5.pdf and b/w06_hpo_bo/images/acq_func_images/es/es_5.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/es/es_6.pdf b/w06_hpo_bo/images/acq_func_images/es/es_6.pdf index 2191630..b829349 100644 Binary files a/w06_hpo_bo/images/acq_func_images/es/es_6.pdf and b/w06_hpo_bo/images/acq_func_images/es/es_6.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/kg_1.pdf b/w06_hpo_bo/images/acq_func_images/kg/kg_1.pdf new file mode 100644 index 0000000..d192610 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/kg/kg_1.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/kg_2.pdf b/w06_hpo_bo/images/acq_func_images/kg/kg_2.pdf new file mode 100644 index 0000000..f1912e5 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/kg/kg_2.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/kg_3.pdf b/w06_hpo_bo/images/acq_func_images/kg/kg_3.pdf new file mode 100644 index 0000000..b4a97a4 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/kg/kg_3.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/kg_4.pdf b/w06_hpo_bo/images/acq_func_images/kg/kg_4.pdf new file mode 100644 index 0000000..12af9c7 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/kg/kg_4.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/kg_5.pdf b/w06_hpo_bo/images/acq_func_images/kg/kg_5.pdf new file mode 100644 index 0000000..623ad40 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/kg/kg_5.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_1.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_1.pdf deleted file mode 100644 index 3c41390..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_1.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_1a.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_1a.pdf deleted file mode 100644 index 948a418..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_1a.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_1b.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_1b.pdf deleted file mode 100644 index 0358402..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_1b.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3.pdf deleted file mode 100644 index 4fb221e..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3a.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3a.pdf deleted file mode 100644 index 74e2165..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3a.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3b.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3b.pdf deleted file mode 100644 index 57f5310..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3b.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3c.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3c.pdf deleted file mode 100644 index 3e30877..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_3c.pdf and /dev/null differ diff --git 
a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_KG_2.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_KG_2.pdf deleted file mode 100644 index 73c0fe0..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_KG_2.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_KG_4.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_KG_4.pdf deleted file mode 100644 index 1761d4b..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_KG_4.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_KG_5.pdf b/w06_hpo_bo/images/acq_func_images/kg/look_ahead_KG_5.pdf deleted file mode 100644 index f4160a9..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/kg/look_ahead_KG_5.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/lcb/lcb_1.pdf b/w06_hpo_bo/images/acq_func_images/lcb/lcb_1.pdf index 913d261..cff2c19 100644 Binary files a/w06_hpo_bo/images/acq_func_images/lcb/lcb_1.pdf and b/w06_hpo_bo/images/acq_func_images/lcb/lcb_1.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/lcb/lcb_2.pdf b/w06_hpo_bo/images/acq_func_images/lcb/lcb_2.pdf index adf36d9..bcec7d2 100644 Binary files a/w06_hpo_bo/images/acq_func_images/lcb/lcb_2.pdf and b/w06_hpo_bo/images/acq_func_images/lcb/lcb_2.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/lcb/lcb_3.pdf b/w06_hpo_bo/images/acq_func_images/lcb/lcb_3.pdf deleted file mode 100644 index 4dc96fa..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/lcb/lcb_3.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/lcb/lcb_4.pdf b/w06_hpo_bo/images/acq_func_images/lcb/lcb_4.pdf deleted file mode 100644 index 72c41f1..0000000 Binary files a/w06_hpo_bo/images/acq_func_images/lcb/lcb_4.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_1.pdf b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_1.pdf new file mode 100644 index 0000000..2f56c46 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_1.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_2.pdf b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_2.pdf new file mode 100644 index 0000000..125dcfd Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_2.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_3.pdf b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_3.pdf new file mode 100644 index 0000000..0564722 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_3.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_4.pdf b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_4.pdf new file mode 100644 index 0000000..f4ba258 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_4.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_5.pdf b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_5.pdf new file mode 100644 index 0000000..61bcaac Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_5.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_6.pdf b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_6.pdf new file mode 100644 index 0000000..fe3e1ed Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_6.pdf differ diff --git 
a/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_7.pdf b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_7.pdf new file mode 100644 index 0000000..00de135 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/lookahead/look_ahead_7.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/pi/pi_1.pdf b/w06_hpo_bo/images/acq_func_images/pi/pi_1.pdf index 4f985f7..094769b 100644 Binary files a/w06_hpo_bo/images/acq_func_images/pi/pi_1.pdf and b/w06_hpo_bo/images/acq_func_images/pi/pi_1.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/pi/pi_2.pdf b/w06_hpo_bo/images/acq_func_images/pi/pi_2.pdf index f946c1c..84df3b5 100644 Binary files a/w06_hpo_bo/images/acq_func_images/pi/pi_2.pdf and b/w06_hpo_bo/images/acq_func_images/pi/pi_2.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/pi/pi_3.pdf b/w06_hpo_bo/images/acq_func_images/pi/pi_3.pdf index 7d07128..fcab276 100644 Binary files a/w06_hpo_bo/images/acq_func_images/pi/pi_3.pdf and b/w06_hpo_bo/images/acq_func_images/pi/pi_3.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/pi/pi_4.pdf b/w06_hpo_bo/images/acq_func_images/pi/pi_4.pdf index 68ed292..6cd4f3b 100644 Binary files a/w06_hpo_bo/images/acq_func_images/pi/pi_4.pdf and b/w06_hpo_bo/images/acq_func_images/pi/pi_4.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/pi/pi_5.pdf b/w06_hpo_bo/images/acq_func_images/pi/pi_5.pdf index 10cf6c0..b0cb96d 100644 Binary files a/w06_hpo_bo/images/acq_func_images/pi/pi_5.pdf and b/w06_hpo_bo/images/acq_func_images/pi/pi_5.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/pi/pi_6.pdf b/w06_hpo_bo/images/acq_func_images/pi/pi_6.pdf new file mode 100644 index 0000000..20a47a6 Binary files /dev/null and b/w06_hpo_bo/images/acq_func_images/pi/pi_6.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ts/ts_1.pdf b/w06_hpo_bo/images/acq_func_images/ts/ts_1.pdf index fed0ed1..8b2873a 100644 Binary files a/w06_hpo_bo/images/acq_func_images/ts/ts_1.pdf and b/w06_hpo_bo/images/acq_func_images/ts/ts_1.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ts/ts_2.pdf b/w06_hpo_bo/images/acq_func_images/ts/ts_2.pdf index 966f0dc..d2fd276 100644 Binary files a/w06_hpo_bo/images/acq_func_images/ts/ts_2.pdf and b/w06_hpo_bo/images/acq_func_images/ts/ts_2.pdf differ diff --git a/w06_hpo_bo/images/acq_func_images/ts/ts_3.pdf b/w06_hpo_bo/images/acq_func_images/ts/ts_3.pdf index 934f4e9..abe3462 100644 Binary files a/w06_hpo_bo/images/acq_func_images/ts/ts_3.pdf and b/w06_hpo_bo/images/acq_func_images/ts/ts_3.pdf differ diff --git a/w06_hpo_bo/images/intro_images/BOLoop_1.pdf b/w06_hpo_bo/images/intro_images/BOLoop_1.pdf deleted file mode 100644 index a9fca88..0000000 Binary files a/w06_hpo_bo/images/intro_images/BOLoop_1.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/BOLoop_2.pdf b/w06_hpo_bo/images/intro_images/BOLoop_2.pdf deleted file mode 100644 index fd8bffa..0000000 Binary files a/w06_hpo_bo/images/intro_images/BOLoop_2.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/BOLoop_3.pdf b/w06_hpo_bo/images/intro_images/BOLoop_3.pdf deleted file mode 100644 index dc289ee..0000000 Binary files a/w06_hpo_bo/images/intro_images/BOLoop_3.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/BOLoop_4.pdf b/w06_hpo_bo/images/intro_images/BOLoop_4.pdf deleted file mode 100644 index 53abe5c..0000000 Binary files a/w06_hpo_bo/images/intro_images/BOLoop_4.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/BOLoop_5.pdf 
b/w06_hpo_bo/images/intro_images/BOLoop_5.pdf deleted file mode 100644 index 9ea13cc..0000000 Binary files a/w06_hpo_bo/images/intro_images/BOLoop_5.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/BOLoop_6.pdf b/w06_hpo_bo/images/intro_images/BOLoop_6.pdf deleted file mode 100644 index c3a99d2..0000000 Binary files a/w06_hpo_bo/images/intro_images/BOLoop_6.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/BOLoop_7.pdf b/w06_hpo_bo/images/intro_images/BOLoop_7.pdf deleted file mode 100644 index f6896e9..0000000 Binary files a/w06_hpo_bo/images/intro_images/BOLoop_7.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/IntroPlots_Acqui.pdf b/w06_hpo_bo/images/intro_images/IntroPlots_Acqui.pdf new file mode 100644 index 0000000..d38c81b Binary files /dev/null and b/w06_hpo_bo/images/intro_images/IntroPlots_Acqui.pdf differ diff --git a/w06_hpo_bo/images/intro_images/IntroPlots_Complete.pdf b/w06_hpo_bo/images/intro_images/IntroPlots_Complete.pdf new file mode 100644 index 0000000..2cb129a Binary files /dev/null and b/w06_hpo_bo/images/intro_images/IntroPlots_Complete.pdf differ diff --git a/w06_hpo_bo/images/intro_images/IntroPlots_GP.pdf b/w06_hpo_bo/images/intro_images/IntroPlots_GP.pdf new file mode 100644 index 0000000..1dd1028 Binary files /dev/null and b/w06_hpo_bo/images/intro_images/IntroPlots_GP.pdf differ diff --git a/w06_hpo_bo/images/intro_images/IntroPlots_Iter2.pdf b/w06_hpo_bo/images/intro_images/IntroPlots_Iter2.pdf new file mode 100644 index 0000000..2cb129a Binary files /dev/null and b/w06_hpo_bo/images/intro_images/IntroPlots_Iter2.pdf differ diff --git a/w06_hpo_bo/images/intro_images/IntroPlots_Iter3.pdf b/w06_hpo_bo/images/intro_images/IntroPlots_Iter3.pdf new file mode 100644 index 0000000..96a804d Binary files /dev/null and b/w06_hpo_bo/images/intro_images/IntroPlots_Iter3.pdf differ diff --git a/w06_hpo_bo/images/intro_images/IntroPlots_Iter4.pdf b/w06_hpo_bo/images/intro_images/IntroPlots_Iter4.pdf new file mode 100644 index 0000000..22ac231 Binary files /dev/null and b/w06_hpo_bo/images/intro_images/IntroPlots_Iter4.pdf differ diff --git a/w06_hpo_bo/images/intro_images/IntroPlots_ObjFun.pdf b/w06_hpo_bo/images/intro_images/IntroPlots_ObjFun.pdf new file mode 100644 index 0000000..a2e479b Binary files /dev/null and b/w06_hpo_bo/images/intro_images/IntroPlots_ObjFun.pdf differ diff --git a/w06_hpo_bo/images/intro_images/IntroPlots_Obs.pdf b/w06_hpo_bo/images/intro_images/IntroPlots_Obs.pdf new file mode 100644 index 0000000..a0716d7 Binary files /dev/null and b/w06_hpo_bo/images/intro_images/IntroPlots_Obs.pdf differ diff --git a/w06_hpo_bo/images/intro_images/blackbox_HPO.png b/w06_hpo_bo/images/intro_images/blackbox_HPO.png new file mode 100644 index 0000000..7249eb6 Binary files /dev/null and b/w06_hpo_bo/images/intro_images/blackbox_HPO.png differ diff --git a/w06_hpo_bo/images/intro_images/blackbox_HPO.tex b/w06_hpo_bo/images/intro_images/blackbox_HPO.tex new file mode 100644 index 0000000..d71840d --- /dev/null +++ b/w06_hpo_bo/images/intro_images/blackbox_HPO.tex @@ -0,0 +1,21 @@ +\newcommand{\myblackbox}{\fcolorbox{black}{black}{ + \minipage[t]{\dimexpr0.111\linewidth-2\fboxsep-2\fboxrule\relax} + ~~~\\ + ~~~\\ + ~~~\\ + \endminipage}} + + + \begin{tikzpicture} +\tikzstyle{every node}=[draw,fill=white,minimum width=0cm,thin] +\tikzstyle{every path}=[-latex,ultra thick] +\node (A) [draw=white]{{\Huge{$\conf$}}}; +\node (B) [right=14mm of A,draw=white] {\myblackbox{}}; +\node (C) [right=14mm of 
B,draw=white] {{\Huge{$f(\conf)$}}}; +%\node (D) [below=7mm of B, align=center, fill=black!10] {\large{Bayesian}\\\large{optimization}}; + +\draw ($(A.east)+(0.2,0.0)$) -- ($(B.west)+(-0.2,0.0)$); +\draw ($(B.east)+(0.2,0.0)$) -- ($(C.west)+(-0.2,0.0)$); +%\draw ($(C.south)+(0.0,-0.2)$) -| ++(0.0,0.0) |- ($(D.east)+(0.2,0.0)$); +%\draw ($(D.west)+(-0.2,0.0)$) |- ++(0.0,0.0) -| ($(A.south)+(0.0,-0.2)$); +\end{tikzpicture} diff --git a/w06_hpo_bo/images/intro_images/branin.png b/w06_hpo_bo/images/intro_images/branin.png deleted file mode 100644 index 820c3cb..0000000 Binary files a/w06_hpo_bo/images/intro_images/branin.png and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/branin_countour.png b/w06_hpo_bo/images/intro_images/branin_countour.png deleted file mode 100644 index 174a7ff..0000000 Binary files a/w06_hpo_bo/images/intro_images/branin_countour.png and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/plot_datapoints.pdf b/w06_hpo_bo/images/intro_images/plot_datapoints.pdf deleted file mode 100644 index fa95b8e..0000000 Binary files a/w06_hpo_bo/images/intro_images/plot_datapoints.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/plot_posterior.pdf b/w06_hpo_bo/images/intro_images/plot_posterior.pdf deleted file mode 100644 index 29a96e0..0000000 Binary files a/w06_hpo_bo/images/intro_images/plot_posterior.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/plot_posterior_1000_sample.pdf b/w06_hpo_bo/images/intro_images/plot_posterior_1000_sample.pdf deleted file mode 100644 index 0de78c0..0000000 Binary files a/w06_hpo_bo/images/intro_images/plot_posterior_1000_sample.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/plot_posterior_100_sample.pdf b/w06_hpo_bo/images/intro_images/plot_posterior_100_sample.pdf deleted file mode 100644 index 94b408a..0000000 Binary files a/w06_hpo_bo/images/intro_images/plot_posterior_100_sample.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/plot_posterior_10_sample.pdf b/w06_hpo_bo/images/intro_images/plot_posterior_10_sample.pdf deleted file mode 100644 index b101096..0000000 Binary files a/w06_hpo_bo/images/intro_images/plot_posterior_10_sample.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/plot_posterior_1_sample.pdf b/w06_hpo_bo/images/intro_images/plot_posterior_1_sample.pdf deleted file mode 100644 index 8b1a38c..0000000 Binary files a/w06_hpo_bo/images/intro_images/plot_posterior_1_sample.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/intro_images/plot_posterior_3_sample.pdf b/w06_hpo_bo/images/intro_images/plot_posterior_3_sample.pdf deleted file mode 100644 index 0d65170..0000000 Binary files a/w06_hpo_bo/images/intro_images/plot_posterior_3_sample.pdf and /dev/null differ diff --git a/w06_hpo_bo/images/success_stories/AutoWEKA_space.png b/w06_hpo_bo/images/success_stories/AutoWEKA_space.png new file mode 100644 index 0000000..f6ec96d Binary files /dev/null and b/w06_hpo_bo/images/success_stories/AutoWEKA_space.png differ diff --git a/w06_hpo_bo/images/success_stories/FB_RGPE.png b/w06_hpo_bo/images/success_stories/FB_RGPE.png new file mode 100644 index 0000000..0310f7a Binary files /dev/null and b/w06_hpo_bo/images/success_stories/FB_RGPE.png differ diff --git a/w06_hpo_bo/images/success_stories/SMAC_citations.png b/w06_hpo_bo/images/success_stories/SMAC_citations.png new file mode 100644 index 0000000..b5dc77d Binary files /dev/null and b/w06_hpo_bo/images/success_stories/SMAC_citations.png differ diff --git 
a/w06_hpo_bo/images/success_stories/SMAC_github_stats.png b/w06_hpo_bo/images/success_stories/SMAC_github_stats.png new file mode 100644 index 0000000..a6a4b16 Binary files /dev/null and b/w06_hpo_bo/images/success_stories/SMAC_github_stats.png differ diff --git a/w06_hpo_bo/images/success_stories/SMAC_paper.png b/w06_hpo_bo/images/success_stories/SMAC_paper.png new file mode 100644 index 0000000..3000c1f Binary files /dev/null and b/w06_hpo_bo/images/success_stories/SMAC_paper.png differ diff --git a/w06_hpo_bo/images/success_stories/Sum_of_little_blackboxes.png b/w06_hpo_bo/images/success_stories/Sum_of_little_blackboxes.png new file mode 100644 index 0000000..cc958e7 Binary files /dev/null and b/w06_hpo_bo/images/success_stories/Sum_of_little_blackboxes.png differ diff --git a/w06_hpo_bo/images/success_stories/hips_spearmint_git_stats.png b/w06_hpo_bo/images/success_stories/hips_spearmint_git_stats.png new file mode 100644 index 0000000..091b73b Binary files /dev/null and b/w06_hpo_bo/images/success_stories/hips_spearmint_git_stats.png differ diff --git a/w06_hpo_bo/images/success_stories/hyperopt_git_stats.png b/w06_hpo_bo/images/success_stories/hyperopt_git_stats.png new file mode 100644 index 0000000..00cf621 Binary files /dev/null and b/w06_hpo_bo/images/success_stories/hyperopt_git_stats.png differ diff --git a/w06_hpo_bo/images/success_stories/hyperopt_sklearn_git_stats.png b/w06_hpo_bo/images/success_stories/hyperopt_sklearn_git_stats.png new file mode 100644 index 0000000..77aaf69 Binary files /dev/null and b/w06_hpo_bo/images/success_stories/hyperopt_sklearn_git_stats.png differ diff --git a/w06_hpo_bo/images/success_stories/jsnoek_spearmint_git_stats.png b/w06_hpo_bo/images/success_stories/jsnoek_spearmint_git_stats.png new file mode 100644 index 0000000..51dcb5b Binary files /dev/null and b/w06_hpo_bo/images/success_stories/jsnoek_spearmint_git_stats.png differ diff --git a/w06_hpo_bo/images/success_stories/spearmint_alt_stats.png b/w06_hpo_bo/images/success_stories/spearmint_alt_stats.png new file mode 100644 index 0000000..9af2af3 Binary files /dev/null and b/w06_hpo_bo/images/success_stories/spearmint_alt_stats.png differ diff --git a/w06_hpo_bo/images/success_stories/spearmint_stats.png b/w06_hpo_bo/images/success_stories/spearmint_stats.png new file mode 100644 index 0000000..b5920c5 Binary files /dev/null and b/w06_hpo_bo/images/success_stories/spearmint_stats.png differ diff --git a/w06_hpo_bo/images/tpe/tpeiter_1_observations.png b/w06_hpo_bo/images/tpe/tpeiter_1_observations.png new file mode 100644 index 0000000..3a88776 Binary files /dev/null and b/w06_hpo_bo/images/tpe/tpeiter_1_observations.png differ diff --git a/w06_hpo_bo/images/tpe/tpeiter_1_pdfs.png b/w06_hpo_bo/images/tpe/tpeiter_1_pdfs.png new file mode 100644 index 0000000..9ce40b5 Binary files /dev/null and b/w06_hpo_bo/images/tpe/tpeiter_1_pdfs.png differ diff --git a/w06_hpo_bo/images/tpe/tpeiter_2_observations.png b/w06_hpo_bo/images/tpe/tpeiter_2_observations.png new file mode 100644 index 0000000..adc78ab Binary files /dev/null and b/w06_hpo_bo/images/tpe/tpeiter_2_observations.png differ diff --git a/w06_hpo_bo/images/tpe/tpeiter_2_pdfs.png b/w06_hpo_bo/images/tpe/tpeiter_2_pdfs.png new file mode 100644 index 0000000..3634e63 Binary files /dev/null and b/w06_hpo_bo/images/tpe/tpeiter_2_pdfs.png differ diff --git a/w06_hpo_bo/images/tpe/tpeiter_3_observations.png b/w06_hpo_bo/images/tpe/tpeiter_3_observations.png new file mode 100644 index 0000000..1a3a811 Binary files /dev/null and 
b/w06_hpo_bo/images/tpe/tpeiter_3_observations.png differ diff --git a/w06_hpo_bo/images/tpe/tpeiter_3_pdfs.png b/w06_hpo_bo/images/tpe/tpeiter_3_pdfs.png new file mode 100644 index 0000000..f98140c Binary files /dev/null and b/w06_hpo_bo/images/tpe/tpeiter_3_pdfs.png differ diff --git a/w06_hpo_bo/scripts/bo_configurations.py b/w06_hpo_bo/scripts/bo_configurations.py index 0555869..388d8fd 100644 --- a/w06_hpo_bo/scripts/bo_configurations.py +++ b/w06_hpo_bo/scripts/bo_configurations.py @@ -6,6 +6,7 @@ colors = dict({ 'observations': 'black', 'highlighted_observations': 'green', + 'new_observation': 'red', 'current_incumbent': 'red', 'highlighted_point': 'red', 'gp_mean': '#0F028A', @@ -16,6 +17,8 @@ 'envelope_max_opacity': 0.8, 'minor_tick_highlight': 'red', 'acq_func_fill': 'lightblue', + 'acq_func_intro': 'seagreen', + 'acq_func_intro_fill': 'mediumaquamarine' }) # Various parameters for plotting required by our own code @@ -59,4 +62,17 @@ def f(x): "x": (2, 9), "gp_y": (-3, 3), "acq_y": (0, 5), + "x_intro": (2, 13), + "y_intro": (0, 15) + +} + +zorders = { + 'annotations_low': 20, + 'zone_of_imp': 30, + 'annotations_normal': 40, + 'datapoints': 41, + 'incumbent': 42, + 'annotations_high': 60, + 'legend': 100 } \ No newline at end of file diff --git a/w06_hpo_bo/scripts/bo_intro.py b/w06_hpo_bo/scripts/bo_intro.py index 029630c..f0d1fec 100644 --- a/w06_hpo_bo/scripts/bo_intro.py +++ b/w06_hpo_bo/scripts/bo_intro.py @@ -1,42 +1,221 @@ -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import RBF -from bo_intro_utils import * +import warnings +warnings.filterwarnings('ignore') +import argparse +import logging +from functools import partial + +import numpy as np +from scipy.optimize import minimize +from sklearn.gaussian_process import GaussianProcessRegressor as GPR +from sklearn.gaussian_process.kernels import Matern + from matplotlib import pyplot as plt -plt.style.use(['ggplot', 'seaborn-talk']) +import bo_intro_utils as boplot +from bo_configurations import * + + +SEED = None +INIT_X_PRESENTATION = [4.5, 10] +NUM_ACQ_OPTS = 50 # Number of times the acquisition function is optimized while looking for the next x to sample. +TOGGLE_PRINT = True + + +def initialize_dataset(initial_design, init=None): + """ + Initialize some data to start fitting the GP on. + :param initial_design: Method for initializing the GP, choice between 'uniform', 'random', and 'presentation' + :param init: Number of datapoints to initialize with, if relevant + :return: + """ + + # sample initial query points + if initial_design == 'uniform': + x = np.linspace(bounds["x_intro"][0], bounds["x_intro"][1], init).reshape(-1, 1).tolist() + elif initial_design == 'random': + x = np.random.uniform(bounds["x_intro"][0], bounds["x_intro"][1], init).reshape(-1, 1).tolist() + elif initial_design == 'presentation': + x = np.array(INIT_X_PRESENTATION).reshape(-1, 1).tolist() + + # get corresponding response values + y = list(map(f, x)) + + return x, y + + +def run_bo(acquisition, max_iter, initial_design, acq_add, init=None): + """ + BO + :param acquisition: type of acquisition function to be used + :param max_iter: max number of function calls + :param seed: seed used to keep experiments reproducible + :param initial_design: Method for initializing the GP, choice between 'uniform', 'random', and 'presentation' + :param acq_add: additional parameteres for acquisition function (e.g. kappa for LCB) + :param init: Number of datapoints to initialize GP with. 
+ :return: all evaluated points. + """ + + logging.debug("Running BO with Acquisition Function {0}, maximum iterations {1}, initial design {2}, " + "acq_add {3} and init {4}".format(acquisition, max_iter, initial_design, acq_add, init)) + x, y = initialize_dataset(initial_design=initial_design, init=init) + logging.debug("Initialized dataset with:\nsamples {0}\nObservations {1}".format(x, y)) + + for i in range(1, max_iter): # BO loop + logging.debug('Sample #%d' % (i)) + + # Fit GP to the currently available dataset + gp = GPR(kernel=Matern()) + logging.debug("Fitting GP to\nx: {}\ny:{}".format(x, y)) + + + #Add data to fit a more stable GP + x_add = np.add(x, 0.05) + x_fit = np.append(x, x_add).reshape(-1, 1) + y_fit = y + list(map(f, x_add)) + gp.fit(x_fit, y_fit) # fit the model + + # ----------Plotting calls--------------- + fig, ax1 = plt.subplots(1, 1, squeeze=True) + fig.tight_layout() + ax1.set_xlim(bounds["x_intro"]) + ax1.set_ylim(bounds["y_intro"]) + ax1.set_yticks([]) + ax1.grid() + + boplot.plot_objective_function(ax=ax1, translation=10) + + annotate = False + if i == 1: + annotate = True + + boplot.plot_gp(model=gp, confidence_intervals=[1.0, 2.0, 3.0], ax=ax1, custom_x=x, annotate=annotate, + translation=10) + + mark_incumbent = False + + boplot.mark_observations(X_=x, Y_=y, ax=ax1, mark_incumbent=mark_incumbent, highlight_datapoint=len(y)-1) + + # # Partially initialize the acquisition function to work with the fmin interface + # # (only the x parameter is not specified) + acqui = partial(acquisition, model=gp, eta=min(y), add=acq_add) + + annotate = False + if i == 1: + annotate = True + + boplot.plot_acquisition_function(acquisition, min(y), gp, acq_add, invert=True, ax=ax1, annotate=annotate, + scaling=30) + + # optimize acquisition function, repeat 10 times, use best result + x_ = None + y_ = 10000 + # Feel free to adjust the hyperparameters + for j in range(NUM_ACQ_OPTS): + opt_res = minimize(acqui, np.random.uniform(bounds["x_intro"][0], bounds["x_intro"][1]), method="L-BFGS-B", bounds=[(bounds["x_intro"][0], bounds["x_intro"][1])]) + if opt_res.fun[0] < y_: + x_ = opt_res.x + y_ = opt_res.fun[0] + + + # Update dataset with new observation + x.append(x_) + y.append(f(x_)) + logging.info("After {0}. 
loop iteration".format(i)) + logging.info("x: {0:.3E}, y: {1:.3E}".format(x_[0], y_)) + + if i==1: + annotate_x = INIT_X_PRESENTATION[0] + ax1.annotate("Observation", xy=(annotate_x, f([annotate_x])+ 10), xytext=(annotate_x - 1, f([annotate_x]) + 14), + arrowprops={'arrowstyle': 'fancy'}, zorder=10, fontsize='x-large') + annotate_x = INIT_X_PRESENTATION[1] + ax1.annotate("Objective function", xy=(annotate_x + 1, f([annotate_x + 1])+ 10), xytext=(annotate_x + 0.25, f([annotate_x + 1])+ 7), + arrowprops={'arrowstyle': 'fancy'}, zorder=10, fontsize='x-large') + # + if i==2: + ax1.annotate("New observation", xy=(new_observation, f(new_observation)+ 10), xytext=(new_observation - 1, f(new_observation) + 6), + arrowprops={'arrowstyle': 'fancy'}, zorder=19, fontsize='x-large') + + ax1.set_xlabel(labels['xlabel']) + + new_observation = x_ + + if TOGGLE_PRINT: + plt.savefig("plot_{}.pdf".format(i), dpi='figure',bbox_inches = 'tight') + else: + plt.show() + # --------------------------------------- + + return y -# Initialize Gaussian Process -kernel = 2.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)) -gp = GaussianProcessRegressor(kernel=kernel) +def main(num_evals, init_size, repetitions, initial_design, acq_add, acquisition): + for i in range(repetitions): + bo_res_1 = run_bo(max_iter=num_evals, init=init_size, initial_design=initial_design, acquisition=acquisition, acq_add=acq_add) -# Generate data and fit GP -noise = np.random.rand(4) -data = np.linspace(0.15, 0.9, 4)[:, np.newaxis] -y = np.sin(data[:, 0]*12)+1/2*np.sin(data[:, 0]*11)+1/2*np.sin(data[:, 0]*23) -gp.fit(data, y) -# Plot datapoints -plt.scatter(data[:, 0], y, c='k', marker='X', s=100, zorder=9) -plt.xlim(0, 1) -plt.ylim(-6, 6) -plt.xlabel("x") -plt.ylabel("f(x)") -plt.savefig("plot_datapoints.pdf", format='pdf') -plt.show() +if __name__ == '__main__': + cmdline_parser = argparse.ArgumentParser('AutoMLLecture') -X_ = np.linspace(0, 1, 600) -y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True) + cmdline_parser.add_argument('-n', '--num_func_evals', + default=10, + help='Number of function evaluations', + type=int) + cmdline_parser.add_argument('-f', '--init_db_size', + default=2, + help='Size of the initial database', + type=int) + cmdline_parser.add_argument('-i', '--initial_design', + default="presentation", + choices=['random', 'uniform', 'presentation'], + help='How to choose first observations.') + cmdline_parser.add_argument('-v', '--verbose', + default=False, + help='verbosity', + action='store_true') + cmdline_parser.add_argument('-a', '--acquisition', + default='EI', + choices=['LCB', 'EI', 'PI'], + help='acquisition function') + cmdline_parser.add_argument('-s', '--seed', + default=15, + help='Which seed to use', + required=False, + type=int) + cmdline_parser.add_argument('-r', '--repetitions', + default=1, + help='Number of repeations for the experiment', + required=False, + type=int) + cmdline_parser.add_argument('-p', '--print', + default=True, + help='Print graphs to file instead of displaying on screen.', + action='store_true') + args, unknowns = cmdline_parser.parse_known_args() + log_lvl = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig(level=log_lvl) -samples = [1, 3, 10, 100, 1000] -seed = 13 + if unknowns: + logging.warning('Found unknown arguments!') + logging.warning(str(unknowns)) + logging.warning('These will be ignored') -# Plot samples from GP -for i, num_sample in enumerate(samples): - plot_sample_gp(num_samples=num_sample, data=data, y=y, X_domain=X_, gp=gp, 
rnd_state=seed) + # init_size = max(1, int(args.num_func_evals * args.fraction_init)) + # Seed the RNG to obtain reproducible results + SEED = args.seed + np.random.seed(SEED) -# Plot GP posterior -plot_posterior_and_density(data=data, y=y, X_=X_, y_mean=y_mean, y_cov=y_cov, gp=gp, rnd_state=seed) + TOGGLE_PRINT = args.print + if TOGGLE_PRINT: + boplot.enable_printing(figsize=(30, 10)) + else: + boplot.enable_onscreen_display() + main( num_evals=args.num_func_evals, + init_size=args.init_db_size, + repetitions=args.repetitions, + initial_design=args.initial_design, + acquisition=acquisition_functions[args.acquisition], + acq_add=1 + ) \ No newline at end of file diff --git a/w06_hpo_bo/scripts/bo_intro_utils.py b/w06_hpo_bo/scripts/bo_intro_utils.py index ffbc6fa..7f5a547 100644 --- a/w06_hpo_bo/scripts/bo_intro_utils.py +++ b/w06_hpo_bo/scripts/bo_intro_utils.py @@ -1,72 +1,372 @@ -import numpy as np from matplotlib import pyplot as plt -import seaborn as sns -plt.style.use(['ggplot', 'seaborn-talk']) +import numpy as np +import logging + +from bo_configurations import * +from matplotlib import rcParams +from matplotlib.patches import Rectangle + +from scipy.stats import norm + +rcParams["font.size"] = 36 +rcParams["axes.linewidth"] = 3 +rcParams["lines.linewidth"] = 4 +rcParams["lines.markersize"] = 26 +rcParams["legend.loc"] = "best" +rcParams["legend.fontsize"] = 30 +rcParams['axes.labelsize'] = 48 +rcParams['xtick.minor.pad'] = 30.0 +rcParams['xtick.labelsize'] = 48 +#rcParams['ytick.minor.pad'] = -50.0 + + +def enable_printing(figsize=(21, 9)): + rcParams["figure.figsize"] = figsize + rcParams["figure.dpi"] = 300.0 + rcParams["savefig.dpi"] = 'figure' + rcParams["savefig.format"] = 'pdf' + +def enable_onscreen_display(): + rcParams["figure.figsize"] = (16, 9) + rcParams["figure.dpi"] = 100.0 + + +def set_rcparams(**kwargs): + for key, value in kwargs.items(): + rcParams[key] = value + + +def get_plot_domain(precision=None, custom_x=None): + """ + Generates the default domain of configuration values to be plotted. + :param precision: Number of samples per unit interval [0, 1). If None (default), uses params['sample_precision']. + :param custom_x: (Optional) Numpy-array compatible list of x values that must be included in the plot. + :return: A NumPy-array of shape [-1, 1] + """ + if precision is None: + X_ = np.arange(bounds["x_intro"][0], bounds["x_intro"][1], 1 / params['sample_precision']).reshape(-1, 1) + else: + X_ = np.arange(bounds["x_intro"][0], bounds["x_intro"][1], 1 / precision).reshape(-1, 1) + if custom_x is not None: + custom_x = np.array(custom_x).reshape(-1, 1) + logging.debug("Custom x has shape {0}".format(custom_x.shape)) + X_ = np.unique(np.vstack((X_, custom_x))).reshape(-1, 1) + + return X_ + + +# Plot objective function, defined f(x) +def plot_objective_function(ax=None, translation=0): + """ + Plots the underlying true objective function being used for BO. + :param ax: matplotlib.Axes.axes object given by the user, or newly generated for a 1x1 figure if None (default). + :param translation: int for plotting a translated objective function + :return: None if ax was given, otherwise the new matplotlib.Axes.axes object. 
+ """ + return_flag = False + if ax is None: + fig, ax = plt.subplots(1, 1, squeeze=True) + return_flag = True + X_ = get_plot_domain() + ax.plot(X_, np.add(f([X_]), translation), linestyle='--', label="Objective function") + + return ax if return_flag else None + + +def mark_current_incumbent(x, y, invert_y=False, ax=None, translation=0): + """ + Convenience function to mark the current incumbent on the graph. + :param x: Current incumbent's configuration. + :param y: Current incumbent's observed cost. + :param invert_y: Use the negative of the given y value, useful when switching between minimization and maximization. + :param ax: A matplotlib.Axes.axes object on which the graphs are plotted. If None (default), a new 1x1 subplot is + generated and the corresponding axes object is returned. + :param translation: int for translating the coordinates along the y axis + :return: If ax is None, the matplotlib.Axes.axes object on which plotting took place, else None. + """ + + if invert_y: + y = -y + ax.scatter(x, np.add(y, translation), color=colors['current_incumbent'], marker='v', label=labels['incumbent'], zorder=12) + + +def mark_observations(X_, Y_, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=None, translation=10): + """ + Plots the given dataset as data observed thus far, including the current incumbent unless otherwise specified. + :param X_: Configurations. + :param Y_: Observed Costs. + :param mark_incumbent: When True (default), distinctly marks the location of the current incumbent. + :param highlight_datapoint: Optional array of indices of configurations in X_ which will be highlighted. + :param highlight_label: Optional legend label for highlighted datapoints. + :param ax: matplotlib.Axes.axes object given by the user, or newly generated for a 1x1 figure if None (default). + :param translation: int for translating the coordinates along the y axis + :return: None if ax was given, otherwise the new matplotlib.Axes.axes object. + """ + return_flag = False + if ax is None: + fig, ax = plt.subplots(1, 1, squeeze=True) + return_flag = True + + X_ = np.array(X_).reshape(-1, 1) + Y_ = np.array(Y_).reshape(-1, 1) + mask = np.ones(X_.shape[0], dtype=bool) + logging.debug("Marking dataset with X of shape {} and Y of shape {}".format(X_.shape, Y_.shape)) + if mark_incumbent: + incumb_idx = np.argmin(Y_) + mark_current_incumbent(X_[incumb_idx, 0], Y_[incumb_idx, 0], ax=ax) + mask[incumb_idx] = 0 + + if highlight_datapoint is not None: + logging.debug("Placing highlights on labels at indices: {}".format(highlight_datapoint)) + ax.scatter( + X_[highlight_datapoint, 0], + np.add(Y_[highlight_datapoint, 0], translation), + color=colors['new_observation'], + marker='X', + label=highlight_label, + zorder=11 + ) + mask[highlight_datapoint] = 0 + ax.scatter(X_[mask, 0], np.add(Y_[mask, 0], translation), color=colors['observations'], marker='X', label="Observations", zorder=10) + + return ax if return_flag else None + + +def plot_gp_samples(mu, nsamples, precision=None, custom_x=None, show_min=False, ax=None): + """ + Plot a number of samples from a GP. + :param mu: numpy NDArray of shape [-1, nsamples] containing samples from the GP. + :param nsamples: Number of samples to be drawn from the GP. + :param custom_x: (Optional) Numpy-array compatible list of x values that must be included in the plot. + :param precision: Set plotting precision per unit along x-axis. Default params['sample_precision']. + :param show_min: If True, highlights the minima of each sample. 
Default False. + :param ax: A matplotlib.Axes.axes object on which the graphs are plotted. If None (default), a new 1x1 subplot is + generated and the corresponding axes object is returned. + :return: If ax is None, the matplotlib.Axes.axes object on which plotting took place, else None. + """ + return_flag = False + if ax is None: + fig, ax = plt.subplots(1, 1, squeeze=True) + return_flag = True + + X_ = get_plot_domain(precision=precision, custom_x=custom_x) + logging.debug("Generated x values for plotting of shape {0}".format(X_.shape)) + + logging.debug("Plotting values for x of shape {0}".format(X_.shape)) + + min_idx = np.argmin(mu, axis=0).reshape(-1, nsamples) + + rng = np.random.mtrand._rand + if seed is not None: + rng = np.random.RandomState(seed) + + xmin = [] + mumin = [] + for i in range(nsamples): + ax.plot(X_, mu[:, i], color=rng.rand(3), label="Sample {}".format(i+1), alpha=0.6,) + xmin.append(X_[min_idx[0, i], 0]) + mumin.append(mu[min_idx[0, i], i]) + if show_min: + ax.scatter( + xmin, + mumin, + color=colors['highlighted_observations'], + marker='X', + label='Sample Minima', + zorder=11 + ) + + return ax if return_flag else None + -# Get list containing minimums of GP samples -def get_mins(samples=None): - num_samples = samples.shape[1] - if num_samples > 1: - mins = [] - for sample in range(num_samples): - mins.append(np.argmin(samples[:, sample])) +def plot_gp(model, confidence_intervals=None, type='both', custom_x=None, precision=None, ax=None, translation=0, annotate=False): + """ + Plot a GP's mean and, if required, its confidence intervals. + :param model: GP + :param confidence_intervals: If None (default) no confidence envelope is plotted. If a list of positive values + [k1, k2, ...]is given, the confidence intervals k1*sigma, k2*sigma, ... are plotted. + :param type: 'upper'|'lower'|'both' (default) - Type of confidence bound to plot. + :param custom_x: (Optional) Numpy-array compatible list of x values that must be included in the plot. + :param precision: Set plotting precision per unit along x-axis. Default params['sample_precision']. + :param ax: A matplotlib.Axes.axes object on which the graphs are plotted. If None (default), a new 1x1 subplot is + generated and the corresponding axes object is returned. + :param translation: int for translating the coordinates along the y axis + :param annotate: False, If True annotations are added for the Posterior Mean and Uncertainty + :return: If ax is None, the matplotlib.Axes.axes object on which plotting took place, else None. 
+ """ + return_flag = False + if ax is None: + fig, ax = plt.subplots(1, 1, squeeze=True) + return_flag = True + + X_ = get_plot_domain(precision=precision, custom_x=custom_x) + logging.debug("Generated x values for plotting of shape {0}".format(X_.shape)) + + + def draw_confidence_envelopes(mu, sigma, confidence_intervals): + confidence_intervals = np.array(confidence_intervals) + confidence_intervals.sort() + + # Dynamically generate opacities for each confidence envelope + alphas = np.linspace( + start=colors['envelope_max_opacity'], + stop=colors['envelope_min_opacity'], + num=confidence_intervals.shape[0], + endpoint=False + ) + + get_envelope = { + 'upper': lambda mu, k, sigma: (mu, mu + k * sigma), + 'lower': lambda mu, k, sigma: (mu - k * sigma, mu), + 'both': lambda mu, k, sigma: (mu - k * sigma, mu + k * sigma), + } + + for k, alpha in zip(confidence_intervals, alphas): + lower, upper = get_envelope[type](mu, k, sigma) + ax.fill_between( + X_[:, 0], lower + translation, upper + translation, + facecolor=colors['gp_variance'], alpha=alpha, + label="{0:.2f}-Sigma Confidence Envelope".format(k) + ) + + + if annotate: + annotate_x = [6, 8.5] + X_predict = np.vstack((X_, [[annotate_x[0]]])).reshape(-1, 1) + X_predict = np.vstack((X_predict, [[annotate_x[1]]])).reshape(-1, 1) + mu, sigma = model.predict(X_predict, return_std=True) + else: + mu, sigma = model.predict(X_, return_std=True) + logging.debug("Plotting GP with these values:\nSamples:\t\t{0}\nMeans:\t\t{1}\nSTDs:\t\t{2}".format( + X_, mu, sigma + )) + + # Plot the mean + if annotate: + ax.plot(X_, np.add(mu[:-2], translation), color=colors['gp_mean'], label=labels['gp_mean']) + else: + ax.plot(X_, np.add(mu, translation), color=colors['gp_mean'], label=labels['gp_mean']) + + # If needed, plot the confidence envelope(s) + if confidence_intervals is not None: + if annotate: + draw_confidence_envelopes(mu[:-2], sigma[:-2], confidence_intervals) + else: + draw_confidence_envelopes(mu, sigma, confidence_intervals) + if annotate: + ax.annotate("Posterior mean", xy=(annotate_x[0], mu[-2] + 10), xytext=(annotate_x[0] - 1.15, mu[-2]+ 5), + arrowprops={'arrowstyle': 'fancy'}, zorder=19, fontsize='x-large') + ax.annotate("Posterior uncertainty", xy=(annotate_x[1], mu[-1]- sigma[-1] + 10), xytext=(annotate_x[1] - 0.7, mu[-1] - sigma[-1] + 6), + arrowprops={'arrowstyle': 'fancy'}, zorder=10, fontsize='x-large') + return ax if return_flag else None + + +# Plot acquisition function +def plot_acquisition_function(acquisition, eta, model, add=None, invert=False, ax=None, annotate=False, scaling=1): + """ + Generate a plot to visualize the given acquisition function for the model. + :param acquisition: Acquisition function handle, from bo_configurations.acquisition_functions. + :param eta: Best observed value thus far. + :param model: GP to be used as a model. + :param add: Additional parameters passed to the acquisition function. + :param invert: When True (default), it is assumed that the acquisition function needs to be inverted for plotting. + :param ax: A matplotlib.Axes.axes object on which the graphs are plotted. If None (default), a new 1x1 subplot is + generated and the corresponding axes object is returned. + :param annotate: False, If True annotations are added for the Acquisition Function and Acquisition Function Max + :param scaling: int for plotting a scaled acquisition function + :return: If ax is None, the matplotlib.Axes.axes object on which plotting took place, else None. 
+ """ + return_flag = False + if ax is None: + fig, ax = plt.subplots(1, 1, squeeze=True) + + ax.set_xlim(bounds["x_intro"]) + ax.set_ylim(bounds["y_intro"]) + ax.grid() + ax.set_xlabel(labels['xlabel']) + ax.set_ylabel(labels['acq_ylabel']) + ax.set_title(r"Visualization of {}".format(labels[acquisition]), loc='left') + + return_flag = True + + X_ = get_plot_domain().reshape(-1) + + if annotate: + np.hstack((X_, [6])).reshape(-1, 1) + + + acquisition_fun = acquisition_functions[acquisition](X_, model=model, eta=eta, add=add) + if invert: + acquisition_fun = -acquisition_fun + zipped = list(zip(X_, acquisition_fun)) + zipped.sort(key = lambda t: t[0]) + X_, acquisition_fun = list(zip(*zipped)) + + ax.plot(X_, np.clip(acquisition_fun, a_min=-2, a_max=5)*scaling, color=colors['acq_func_intro'], label=labels[acquisition]) + ax.fill_between(X_, np.clip(acquisition_fun, a_min=-2, a_max=5)*scaling, bounds["y_intro"][0], facecolor=colors['acq_func_intro_fill']) + acq_vals = np.clip(acquisition_fun, a_min=-2, a_max=5)*scaling + best = np.argmax(acq_vals) + + if annotate: + ax.annotate("Acquisition function", xy=(6, acq_vals[-1]), + xytext=(6, acq_vals[-1] + 2), + arrowprops={'arrowstyle': 'fancy'}, zorder=10, fontsize='x-large') + ax.annotate("Acquisition max", xy=(X_[best], acq_vals[best]), + xytext=(X_[best] -2.45, acq_vals[best] + 1), + arrowprops={'arrowstyle': 'fancy'}, zorder=10, fontsize='x-large') + + return ax if return_flag else None + + + +def highlight_configuration(x, label=None, lloc='bottom', ax=None, disable_ticks=False, **kwargs): + """ + Draw a vertical line at the given configuration to highlight it. + :param x: Configuration. + :param label: If None (default), the x-value up to decimal places is placed as a minor tick, otherwise the given + label is used. + :param lloc: Can be either 'top' or 'bottom' (default) to indicate the position of the label on the graph. + :param ax: A matplotlib.Axes.axes object on which the graphs are plotted. If None (default), a new 1x1 subplot is + generated and the corresponding axes object is returned. + :param disable_ticks: Only draw the horizontal line, don't bother with the ticks. + :return: If ax is None, the matplotlib.Axes.axes object on which plotting took place, else None. 
+ """ + return_flag = False + if ax is None: + fig, ax = plt.subplots(1, 1, squeeze=True) + return_flag = True + + # Assume we will receive x as a view on a numpy array + x = x.reshape(-1)[0] + logging.info("Highlighting configuration at {} with label {}".format(x, label)) + + ax.vlines( + x, ymin=ax.get_ylim()[0], ymax=ax.get_ylim()[1], + colors=colors['minor_tick_highlight'], linestyles='dashed', + ) + + if disable_ticks: + return ax if return_flag else None + + xlabel = "{0:.2f}".format(x) if label is None else label + + if lloc == 'top': + ax.tick_params( + which='minor', + bottom=False, labelbottom=False, + top=True, labeltop=True + ) else: - mins = np.argmin(samples) - return mins - -# Plot sample from posterior and histogram over minimum -def plot_sample_gp(num_samples=10, data=None, y=None, X_domain=None, gp=None, rnd_state=0): - # Plot sample from posterior - - label = ["Sample Curve"] - f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={'height_ratios': [2, 1]}) - y_samples = gp.sample_y(X_domain[:, np.newaxis], num_samples, random_state=rnd_state) - plots = a0.plot(X_domain, y_samples, lw=1, label=label) - a0.scatter(data[:, 0], y, c='k', marker='X', s=100, zorder=9) - a0.legend(plots[:1], label) - a0.set_ylabel("f(x)") - a0.set_xlabel("x") - a0.set_xlim(0, 1) - a0.set_ylim(-6, 6) - - - mins = get_mins(y_samples) - - sns.distplot(X_domain[mins, np.newaxis], hist=True, kde=False, bins=50, norm_hist=True, hist_kws=dict(edgecolor='k', color='#6BAFFC')) - a1.set_xlim(0, 1) - a1.set_ylabel("Pmin(x)") - a1.set_xlabel("x") - plt.yticks([]) - f.tight_layout() - plt.savefig("plot_posterior_%s_sample.pdf" % num_samples, format='pdf') - plt.show() - - -# Plot GP posterior and density over minimum -def plot_posterior_and_density(data=None, y=None, X_=None, y_mean=None, y_cov=None, gp=None, rnd_state=0): - # Plot GP posterior - f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={'height_ratios': [2, 1]}) - uncertainty = 3.5 * np.sqrt(np.diag(y_cov)) - a0.plot(X_, y_mean, color='#0F028A', linewidth=2, alpha=0.8, label="GP mean") - a0.fill_between(X_, y_mean - uncertainty, y_mean + uncertainty, alpha=0.3, facecolor='lightblue', edgecolor='k', label="GP variance") - a0.scatter(data[:, 0], y, c='k', marker='X', s=100, zorder=9) - a0.set_xlim(0, 1) - a0.set_ylim(-6, 6) - a0.set_ylabel("f(x)") - a0.set_xlabel("x") - a0.legend() - - # Plot density over minimums from samples - y_samples = gp.sample_y(X_[:, np.newaxis], 1000, rnd_state) - mins = get_mins(y_samples) - - sns.distplot(X_[mins, np.newaxis], kde=True, hist=False, bins=50, color='#6BAFFC') - a1.set_ylabel('Pmin(x)') - a1.set_xlabel("x") - - a1.set_xlim(0, 1) - plt.yticks([]) - f.tight_layout() - plt.savefig("plot_posterior.pdf", format='pdf') - plt.show() + ax.tick_params( + which='minor', + bottom=True, labelbottom=True, + top=False, labeltop=False + ) + + label_props = {'color': colors['minor_tick_highlight'], **kwargs} + ax.set_xticks([x], minor=True) + ax.set_xticklabels([xlabel], label_props, minor=True) + + return ax if return_flag else None + diff --git a/w06_hpo_bo/scripts/bo_loop_acq_functions.py b/w06_hpo_bo/scripts/bo_loop_acq_functions.py index 0565319..7e44a77 100644 --- a/w06_hpo_bo/scripts/bo_loop_acq_functions.py +++ b/w06_hpo_bo/scripts/bo_loop_acq_functions.py @@ -35,24 +35,27 @@ def EI(x, model, eta, add=None, plotting=False): mu, sigma = model.predict(x, return_std=True) with np.errstate(divide='warn'): - improvement = mu - eta + improvement = eta - mu #mu - eta Z = improvement/sigma ei = improvement * norm.cdf(Z) + sigma * 
norm.pdf(Z) ei[sigma == 0.0] = 0.0 # return ei - return -ei if plotting else ei + return ei if plotting else -ei -def LCB(x, model, eta, add): +def LCB(x, model, eta, add, plotting=False): """ - Lower Confidence Bound, returns a value for the minimizer + Lower Confidence Bound :param x: point to determine the acquisition value :param model: GP to predict target function value :param eta: best so far seen value :param add: additional parameters necessary for the function (kappa) + :param plotting: flag to fulfill fmin interface / show plots with functions to be maximized. :return: positive LCB value for plotting, negative for the optimizer. """ x = np.array([x]).reshape([-1, 1]) mu, sigma = model.predict(x, return_std=True) - lcb = mu - add * sigma + kappa = np.sqrt(add) + lcb = mu - kappa * sigma + # return -lcb if plotting else lcb return lcb \ No newline at end of file diff --git a/w06_hpo_bo/scripts/bo_loop_mwe.py b/w06_hpo_bo/scripts/bo_loop_mwe.py deleted file mode 100644 index c8e98b7..0000000 --- a/w06_hpo_bo/scripts/bo_loop_mwe.py +++ /dev/null @@ -1,216 +0,0 @@ -import warnings -warnings.filterwarnings('ignore') -import argparse -import logging -from functools import partial - -import numpy as np -from scipy.optimize import minimize -from sklearn.gaussian_process import GaussianProcessRegressor as GPR -from sklearn.gaussian_process.kernels import Matern - -from matplotlib import pyplot as plt - -import bo_plot_utils as boplot -from bo_configurations import * - - -SEED = None -INIT_X_PRESENTATION = [3, 4, 4.6, 4.8, 5, 9.4, 10, 12.7] -NUM_ACQ_OPTS = 10 # Number of times the acquisition function is optimized while looking for the next x to sample. -TOGGLE_PRINT = False - - -def initialize_dataset(initial_design, init=None): - """ - Initialize some data to start fitting the GP on. - :param initial_design: Method for initializing the GP, choice between 'uniform', 'random', and 'presentation' - :param init: Number of datapoints to initialize with, if relevant - :return: - """ - - # sample initial query points - if initial_design == 'uniform': - x = np.linspace(bounds["x"][0], bounds["x"][1], init).reshape(-1, 1).tolist() - elif initial_design == 'random': - x = np.random.uniform(bounds["x"][0], bounds["x"][1], init).reshape(-1, 1).tolist() - elif initial_design == 'presentation': - x = np.array(INIT_X_PRESENTATION).reshape(-1, 1).tolist() - - # get corresponding response values - y = list(map(f, x)) - - return x, y - - -def run_bo(acquisition, max_iter, initial_design, acq_add, init=None): - """ - BO - :param acquisition: type of acquisition function to be used - :param max_iter: max number of function calls - :param seed: seed used to keep experiments reproducible - :param initial_design: Method for initializing the GP, choice between 'uniform', 'random', and 'presentation' - :param acq_add: additional parameteres for acquisition function (e.g. kappa for LCB) - :param init: Number of datapoints to initialize GP with. - :return: all evaluated points. 
- """ - - logging.debug("Running BO with Acquisition Function {0}, maximum iterations {1}, initial design {2}, " - "acq_add {3} and init {4}".format(acquisition, max_iter, initial_design, acq_add, init)) - x, y = initialize_dataset(initial_design=initial_design, init=init) - logging.debug("Initialized dataset with:\nsamples {0}\nObservations {1}".format(x, y)) - - for i in range(1, max_iter): # BO loop - logging.debug('Sample #%d' % (i)) - - # Fit GP to the currently available dataset - gp = GPR(kernel=Matern()) - logging.debug("Fitting GP to\nx: {}\ny:{}".format(x, y)) - gp.fit(x, y) # fit the model - - # ----------Plotting calls--------------- - fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) - fig.tight_layout() - ax1.set_xlim(bounds["x"]) - ax1.set_ylim(bounds["gp_y"]) - ax1.grid() - ax2.set_xlim(bounds["x"]) - ax2.set_ylim(bounds["acq_y"]) - ax2.grid() - boplot.plot_objective_function(ax=ax1) - boplot.plot_gp(model=gp, confidence_intervals=[1.0, 2.0], ax=ax1, custom_x=x) - boplot.mark_observations(X_=x, Y_=y, ax=ax1) - # --------------------------------------- - - # noinspection PyStringFormat - logging.debug("Model fit to dataset.\nOriginal Inputs: {0}\nOriginal Observations: {1}\n" - "Predicted Means: {2}\nPredicted STDs: {3}".format(x, y, *(gp.predict(x, return_std=True)))) - - # Partially initialize the acquisition function to work with the fmin interface - # (only the x parameter is not specified) - acqui = partial(acquisition, model=gp, eta=min(y), add=acq_add) - - boplot.plot_acquisition_function(acquisition, min(y), gp, acq_add, invert=True, ax=ax2) - - # optimize acquisition function, repeat 10 times, use best result - x_ = None - y_ = 10000 - # Feel free to adjust the hyperparameters - for j in range(NUM_ACQ_OPTS): - opt_res = minimize(acqui, np.random.uniform(bounds["x"][0], bounds["x"][1]), - #bounds=bounds["x"], - options={'maxfun': 20, 'maxiter': 20}, method="L-BFGS-B") - if opt_res.fun[0] < y_: - x_ = opt_res.x - y_ = opt_res.fun[0] - - # ----------Plotting calls--------------- - boplot.highlight_configuration(x_, ax=ax1) - boplot.highlight_configuration(x_, ax=ax2) - # --------------------------------------- - - # Update dataset with new observation - x.append(x_) - y.append(f(x_)) - - logging.info("After {0}. 
loop iteration".format(i)) - logging.info("x: {0:.3E}, y: {1:.3E}".format(x_[0], y_)) - - - - # ----------Plotting calls--------------- - for ax in (ax1, ax2): - ax.legend() - ax.set_xlabel(labels['xlabel']) - - ax1.set_ylabel(labels['gp_ylabel']) - ax1.set_title("Visualization of GP", loc='left') - - ax2.set_title("Visualization of Acquisition Function", loc='left') - ax2.set_ylabel(labels['acq_ylabel']) - if TOGGLE_PRINT: - plt.savefig("plot_{}.pdf".format(i), dpi='figure') - else: - plt.show() - # --------------------------------------- - - return y - - - -def main(num_evals, init_size, repetitions, initial_design, acq_add, acquisition): - for i in range(repetitions): - bo_res_1 = run_bo(max_iter=num_evals, init=init_size, initial_design=initial_design, acquisition=acquisition, acq_add=acq_add) - - - -if __name__ == '__main__': - cmdline_parser = argparse.ArgumentParser('AutoMLLecture') - - cmdline_parser.add_argument('-n', '--num_func_evals', - default=5, - help='Number of function evaluations', - type=int) - cmdline_parser.add_argument('-f', '--init_db_size', - default=4, - help='Size of the initial database', - type=int) - cmdline_parser.add_argument('-i', '--initial_design', - default="random", - choices=['random', 'uniform', 'presentation'], - help='How to choose first observations.') - cmdline_parser.add_argument('-v', '--verbose', - default=False, - help='verbosity', - action='store_true') - cmdline_parser.add_argument('-a', '--acquisition', - default='LCB', - choices=['LCB', 'EI', 'PI'], - help='acquisition function') - cmdline_parser.add_argument('-s', '--seed', - default=15, - help='Which seed to use', - required=False, - type=int) - cmdline_parser.add_argument('-r', '--repetitions', - default=1, - help='Number of repeations for the experiment', - required=False, - type=int) - cmdline_parser.add_argument('-p', '--print', - default=False, - help='Print graphs to file instead of displaying on screen.', - action='store_true') - - args, unknowns = cmdline_parser.parse_known_args() - log_lvl = logging.DEBUG if args.verbose else logging.INFO - logging.basicConfig(level=log_lvl) - - if unknowns: - logging.warning('Found unknown arguments!') - logging.warning(str(unknowns)) - logging.warning('These will be ignored') - - # init_size = max(1, int(args.num_func_evals * args.fraction_init)) - # Seed the RNG to obtain reproducible results - SEED = args.seed - np.random.seed(SEED) - - TOGGLE_PRINT = args.print - if TOGGLE_PRINT: - boplot.enable_printing() - else: - boplot.enable_onscreen_display() - - - #init_size = max(1, int(args.num_func_evals * args.fraction_init)) - - main( num_evals=args.num_func_evals, - # init_size=init_size, - init_size=args.init_db_size, - repetitions=args.repetitions, - initial_design=args.initial_design, - acquisition=acquisition_functions[args.acquisition], - # seed=args.seed, - acq_add=1 - ) \ No newline at end of file diff --git a/w06_hpo_bo/scripts/bo_plot_utils.py b/w06_hpo_bo/scripts/bo_plot_utils.py index bc3fd0c..72091ed 100644 --- a/w06_hpo_bo/scripts/bo_plot_utils.py +++ b/w06_hpo_bo/scripts/bo_plot_utils.py @@ -4,45 +4,68 @@ from bo_configurations import * from matplotlib import rcParams +from matplotlib import rc from matplotlib.patches import Rectangle from scipy.stats import norm -rcParams["font.size"] = 32 -rcParams["axes.linewidth"] = 3 -rcParams["lines.linewidth"] = 4 -rcParams["lines.markersize"] = 26 -rcParams["legend.loc"] = "lower right" -rcParams["legend.fontsize"] = 26 -rcParams['axes.labelsize'] = 36 -rcParams['xtick.minor.pad'] = 30.0 
-#rcParams['ytick.minor.pad'] = -50.0 +RC_FONT = { + "size": 46 +} +rc("font", **RC_FONT) +RC_AXES = { + "linewidth": 3, + "labelsize": 36 +} +rc("axes", **RC_AXES) + +RC_LINES = { + "linewidth": 4, + "markersize": 26 +} +rc("lines", **RC_LINES) + +RC_LEGEND = { + "loc": "lower right", + "fontsize": 26 +} +rc("legend", **RC_LEGEND) +rc(("xtick.minor", "ytick.minor"), pad=10.0) +rc(("xtick", "ytick"), labelsize=32.0) + +# To be implemented when needed in order to keep track of multiple highlighted minor ticks +highlighted_yticks = [] +highlighted_xticks = [] def enable_printing(): - rcParams["figure.figsize"] = (21, 9) - rcParams["figure.dpi"] = 300.0 - rcParams["savefig.dpi"] = 'figure' - rcParams["savefig.format"] = 'pdf' + rc("figure", figsize=(21, 9), dpi=300.0) + rc("savefig", dpi='figure', format='pdf', directory='./outputs') + def enable_onscreen_display(): - rcParams["figure.figsize"] = (16, 9) - rcParams["figure.dpi"] = 100.0 + rc("figure", figsize=(16, 9), dpi=100.0) def set_rcparams(**kwargs): + '''***DEPTRECATED***''' for key, value in kwargs.items(): rcParams[key] = value -def annotate_y_edge(label, xy, ax, align='right'): +def set_rc(group, **kwargs): + rc(group, **kwargs) + + +def annotate_y_edge(label, xy, ax, align='right', yoffset=1.0): """ Place an annotation beneath a horizontal bar, between a given point and either of the left or right edges. :param label: Text to annotate with. :param xy: Given xy-coordinates. :param ax: matplotlib.Axes.axes object given by the user :param align: 'left' or 'right' (default) edge to use. + :param yoffset: Shifts label position towards the top (negative offset) or bottom (positive offset) from xy. :return: None. """ @@ -53,10 +76,11 @@ def annotate_y_edge(label, xy, ax, align='right'): # textxy = ax.transData.transform([x, xy[1]]) # textxy = ax.transData.inverted().transform((textxy[0], textxy[1] - 2 * rcParams["font.size"])) - textxy = (x, xy[1] - (ax.get_ylim()[1] - ax.get_ylim()[0]) / 10) + textxy = (x, xy[1] - yoffset * (ax.get_ylim()[1] - ax.get_ylim()[0]) / 10) # logging.info("Placing text at {}".format(textxy)) - ax.annotate(s=label, xy=textxy, color=colors['minor_tick_highlight'], horizontalalignment='center', zorder=10) + ax.annotate(s=label, xy=textxy, color=colors['minor_tick_highlight'], horizontalalignment='center', + zorder=zorders['annotations_normal']) def annotate_x_edge(label, xy, ax, align='bottom', offset_param=1.5): @@ -80,7 +104,8 @@ def annotate_x_edge(label, xy, ax, align='bottom', offset_param=1.5): textxy = (xy[0] - 0.1, y) # logging.info("Placing text at {}".format(textxy)) - ax.annotate(s=label, xy=textxy, color=colors['minor_tick_highlight'], horizontalalignment='right', zorder=10) + ax.annotate(s=label, xy=textxy, color=colors['minor_tick_highlight'], horizontalalignment='right', + zorder=zorders['annotations_normal']) def get_plot_domain(precision=None, custom_x=None): @@ -132,7 +157,8 @@ def mark_current_incumbent(x, y, invert_y=False, ax=None): if invert_y: y = -y - ax.scatter(x, y, color=colors['current_incumbent'], marker='v', label=labels['incumbent'], zorder=12) + ax.scatter(x, y, color=colors['current_incumbent'], marker='v', label=labels['incumbent'], + zorder=zorders['incumbent']) def mark_observations(X_, Y_, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=None): @@ -168,10 +194,11 @@ def mark_observations(X_, Y_, mark_incumbent=True, highlight_datapoint=None, hig color=colors['highlighted_observations'], marker='X', label=highlight_label, - zorder=11 + 
zorder=zorders['datapoints'] + 1 ) mask[highlight_datapoint] = 0 - ax.scatter(X_[mask, 0], Y_[mask, 0], color=colors['observations'], marker='X', label="Observations", zorder=10) + ax.scatter(X_[mask, 0], Y_[mask, 0], color=colors['observations'], marker='X', label="Observations", + zorder=zorders['datapoints']) return ax if return_flag else None @@ -208,7 +235,7 @@ def plot_gp_samples(mu, nsamples, precision=None, custom_x=None, show_min=False, xmin = [] mumin = [] for i in range(nsamples): - ax.plot(X_, mu[:, i], color=rng.rand(3), label="Sample {}".format(i+1), alpha=0.6,) + ax.plot(X_, mu[:, i], color=rng.rand(3), label="Sample {}".format(i + 1), alpha=0.6, ) xmin.append(X_[min_idx[0, i], 0]) mumin.append(mu[min_idx[0, i], i]) if show_min: @@ -218,13 +245,12 @@ def plot_gp_samples(mu, nsamples, precision=None, custom_x=None, show_min=False, color=colors['highlighted_observations'], marker='X', label='Sample Minima', - zorder=11 + zorder=zorders['datapoints'] ) return ax if return_flag else None - def plot_gp(model, confidence_intervals=None, type='both', custom_x=None, precision=None, ax=None): """ Plot a GP's mean and, if required, its confidence intervals. @@ -246,7 +272,6 @@ def plot_gp(model, confidence_intervals=None, type='both', custom_x=None, precis X_ = get_plot_domain(precision=precision, custom_x=custom_x) logging.debug("Generated x values for plotting of shape {0}".format(X_.shape)) - def draw_confidence_envelopes(mu, sigma, confidence_intervals): confidence_intervals = np.array(confidence_intervals) confidence_intervals.sort() @@ -270,10 +295,9 @@ def draw_confidence_envelopes(mu, sigma, confidence_intervals): ax.fill_between( X_[:, 0], lower, upper, facecolor=colors['gp_variance'], alpha=alpha, - label="{0:.2f}-Sigma Confidence Envelope".format(k) + label="{0:.1f}x Sigma Confidence Envelope".format(k) ) - mu, sigma = model.predict(X_, return_std=True) logging.debug("Plotting GP with these values:\nSamples:\t\t{0}\nMeans:\t\t{1}\nSTDs:\t\t{2}".format( X_, mu, sigma @@ -319,7 +343,7 @@ def plot_acquisition_function(acquisition, eta, model, add=None, ax=None): acquisition_fun = acquisition_functions[acquisition](X_, model=model, eta=eta, add=add) acquisition_fun = -acquisition_fun zipped = list(zip(X_, acquisition_fun)) - zipped.sort(key = lambda t: t[0]) + zipped.sort(key=lambda t: t[0]) X_, acquisition_fun = list(zip(*zipped)) ax.plot(X_, acquisition_fun, color=colors['acq_fun'], label=labels[acquisition]) @@ -333,37 +357,40 @@ def plot_acquisition_function(acquisition, eta, model, add=None, ax=None): # plt.clf() -def highlight_configuration(x, label=None, lloc='bottom', ax=None, disable_ticks=False, **kwargs): +def highlight_configuration(x, label=None, lloc='bottom', ax=None, disable_ticks=False, append_ticks=False, **kwargs): """ Draw a vertical line at the given configuration to highlight it. - :param x: Configuration. + :param x: Configurations to be highlighted. :param label: If None (default), the x-value up to decimal places is placed as a minor tick, otherwise the given - label is used. + labels are used. Assumed to have a one-to-one correspondence with the given configurations. :param lloc: Can be either 'top' or 'bottom' (default) to indicate the position of the label on the graph. :param ax: A matplotlib.Axes.axes object on which the graphs are plotted. If None (default), a new 1x1 subplot is generated and the corresponding axes object is returned. :param disable_ticks: Only draw the horizontal line, don't bother with the ticks. 
+ :param append_ticks: When True, adds the given ticks to those already present. Otherwise, drops the old yticks. + Default is False. :return: If ax is None, the matplotlib.Axes.axes object on which plotting took place, else None. """ + global highlighted_xticks return_flag = False if ax is None: fig, ax = plt.subplots(1, 1, squeeze=True) return_flag = True # Assume we will recieve x as a view on a numpy array - x = x.reshape(-1)[0] - logging.info("Highlighting configuration at {} with label {}".format(x, label)) + xvals = x.reshape(-1) + logging.info("Highlighting configuration at {} with label {}".format(xvals, label)) ax.vlines( - x, ymin=ax.get_ylim()[0], ymax=ax.get_ylim()[1], + xvals, ymin=ax.get_ylim()[0], ymax=ax.get_ylim()[1], colors=colors['minor_tick_highlight'], linestyles='dashed', ) if disable_ticks: + rc("xtick.minor", visible=False) + highlighted_xticks = [] return ax if return_flag else None - xlabel = "{0:.2f}".format(x) if label is None else label - if lloc == 'top': ax.tick_params( which='minor', @@ -377,39 +404,62 @@ def highlight_configuration(x, label=None, lloc='bottom', ax=None, disable_ticks top=False, labeltop=False ) + if label is None: + label = ["{0:.2f}".format(val) for val in xvals] + elif type(label) is str: + label = [label] + else: + label = [l for l in label] + + new_xticks = [(val, l) for val, l in zip(xvals, label)] + + if append_ticks: + highlighted_xticks += new_xticks + else: + highlighted_xticks = new_xticks + + highlighted_xticks.sort(key=lambda e: e[0]) + logging.info(f"Placing minor xticks:{highlighted_xticks}") label_props = {'color': colors['minor_tick_highlight'], **kwargs} - ax.set_xticks([x], minor=True) - ax.set_xticklabels([xlabel], label_props, minor=True) + ax.set_xticks([val[0] for val in highlighted_xticks], minor=True) + ax.set_xticklabels([val[1] for val in highlighted_xticks], label_props, minor=True) return ax if return_flag else None -def highlight_output(y, label=None, lloc='left', ax=None, disable_ticks=False, **kwargs): + +def highlight_output(y, label=None, lloc='left', ax=None, disable_ticks=False, append_ticks=False, **kwargs): """ - Draw a horizontal line at the given y-value to highlight it. - :param y: y-value to be highlighted. + Draw a horizontal line at the given y-values to highlight them. + :param y: y-values to be highlighted. :param label: If None (default), the y-value up to decimal places is placed as a minor tick, otherwise the given - label is used. + labels are used. Assumed to have a one-to-one correspondence with the given y-values. :param lloc: Can be either 'left' (default) or 'right' to indicate the position of the label on the graph. :param ax: A matplotlib.Axes.axes object on which the graphs are plotted. If None (default), a new 1x1 subplot is generated and the corresponding axes object is returned. :param disable_ticks: Only draw the horizontal line, don't bother with the ticks. + :param append_ticks: When True, adds the given ticks to those already present. Otherwise, drops the old yticks. + Default is False. :return: If ax is None, the matplotlib.Axes.axes object on which plotting took place, else None. 
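# Usage sketch for the extended highlight helpers above: both now accept arrays of
# values with matching label lists, and append_ticks=True keeps previously highlighted
# minor ticks instead of replacing them. The axis limits and values below are arbitrary
# stand-ins, assuming bo_plot_utils is importable as in the plotting scripts.
import numpy as np
from matplotlib import pyplot as plt
import bo_plot_utils as boplot

fig, ax = plt.subplots(1, 1, squeeze=True)
ax.set_xlim(2, 13)
ax.set_ylim(-5, 5)

boplot.highlight_configuration(x=np.array([4.5, 11.0]),
                               label=[r'$\lambda_1$', r'$\lambda_2$'],
                               lloc='bottom', ax=ax)
boplot.highlight_output(y=np.array([-1.5]), label=['c'], lloc='left', ax=ax)
boplot.highlight_output(y=np.array([-3.0]), label=[r'$c_{inc}$'], lloc='left',
                        ax=ax, append_ticks=True)   # keep the previous minor tick
plt.show()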
""" return_flag = False + global highlighted_yticks if ax is None: fig, ax = plt.subplots(1, 1, squeeze=True) return_flag = True # Assume we will recieve y as a view on a numpy array - y = y.reshape(-1)[0] + yvals = y.reshape(-1) - ax.hlines( - y, - xmin=ax.get_xlim()[0], xmax=ax.get_xlim()[1], - colors=colors['minor_tick_highlight'], linestyles='dashed' - ) + for val in yvals: + ax.hlines( + val, + xmin=ax.get_xlim()[0], xmax=ax.get_xlim()[1], + colors=colors['minor_tick_highlight'], linestyles='dashed' + ) if disable_ticks: + rc("ytick.minor", visible=False) + highlighted_yticks = [] return ax if return_flag else None if lloc == 'right': @@ -425,13 +475,28 @@ def highlight_output(y, label=None, lloc='left', ax=None, disable_ticks=False, * right=False, labelright=False ) - ylabel = "{0:.2f}".format(y) if label is None else label + if label is None: + label = ["{0:.2f}".format(val) for val in yvals] + elif type(label) is str: + label = [label] + else: + label = [l for l in label] + + new_yticks = [(val, l) for val, l in zip(yvals, label)] + if append_ticks: + highlighted_yticks += new_yticks + else: + highlighted_yticks = new_yticks + highlighted_yticks.sort(key=lambda e: e[0]) + logging.info(f"Placing minor yticks:{highlighted_yticks}") + # ylabel = "{0:.2f}".format(y) if label is None else label label_props = {'color': colors['minor_tick_highlight'], **kwargs} - ax.set_yticks([y], minor=True) - ax.set_yticklabels([ylabel], label_props, minor=True) + ax.set_yticks([val[0] for val in highlighted_yticks], minor=True) + ax.set_yticklabels([val[1] for val in highlighted_yticks], label_props, minor=True) return ax if return_flag else None + def darken_graph(y, ax): """ Darken the graph above a certain y-value. @@ -445,15 +510,16 @@ def darken_graph(y, ax): rectheight = ax.get_ylim()[1] - y rect = Rectangle( recto, rectwidth, rectheight, - fill=True, alpha=0.75, facecolor='white', zorder=8, linewidth=rcParams['lines.linewidth'], edgecolor='grey' + fill=True, alpha=0.75, facecolor='white', + zorder=zorders['zone_of_imp'], linewidth=rcParams['lines.linewidth'], + edgecolor=None # 'grey' ) ax.add_patch(rect) return -def draw_vertical_normal(gp, incumbenty, ax, xtest=0.0, step=0.01, - xlim=2.0, xscale=1.0, yscale=1.0): - +def draw_vertical_normal(gp, incumbenty, ax, xtest=0.0, step=0.01, xscale=1.0, yscale=1.0, fill=True, + draw_domain=True): # Generate a normal pdf centered at xtest ytest_mean, ytest_cov = gp.predict([[xtest]], return_cov=True) mu = ytest_mean[0] @@ -465,7 +531,7 @@ def draw_vertical_normal(gp, incumbenty, ax, xtest=0.0, step=0.01, # print("ytest mean:{}, cov:{}".format(ytest_mean, ytest_cov)) # Generate a Normal distribution centered around it's mean. 
- norm_x = np.arange(mu - xlim, mu + xlim + step, step) + norm_x = np.arange(mu + bounds['gp_y'][0], mu + bounds['gp_y'][1], step) norm_y = norm.pdf(norm_x, mu, sigma) * yscale logging.info("Min of normal_y is: {}\nMean of normal_y is:{}".format(np.min(norm_y), np.mean(norm_y))) @@ -480,11 +546,15 @@ def draw_vertical_normal(gp, incumbenty, ax, xtest=0.0, step=0.01, # vcurve_x = norm_x + xtest # vcurve_y = norm_y + mu - ax.plot(xtest, mu, color='red', marker='o', markersize=20, zorder=14) - ax.vlines(xtest, ymin=ax.get_ylim()[0], ymax=ax.get_ylim()[1], colors='black', linestyles='dashed', zorder=9) - ax.plot(vcurve_x, vcurve_y, color='black', zorder=9) - fill_args = np.where(vcurve_y < incumbenty) - ax.fill_betweenx(vcurve_y[fill_args], xtest, vcurve_x[fill_args], alpha=1.0, facecolor='darkgreen', zorder=14) + ax.plot(xtest, mu, color='red', marker='o', markersize=20, zorder=zorders['annotations_high']) + if draw_domain: + ax.vlines(xtest, ymin=ax.get_ylim()[0], ymax=ax.get_ylim()[1], colors='black', linestyles='dashed', + zorder=zorders['zone_of_imp'] + 1) + ax.plot(vcurve_x, vcurve_y, color='black', zorder=zorders['zone_of_imp'] + 1) + if fill: + fill_args = np.where(vcurve_y < incumbenty) + ax.fill_betweenx(vcurve_y[fill_args], xtest, vcurve_x[fill_args], alpha=1.0, facecolor='darkgreen', + zorder=zorders['annotations_high'] - 5) # ann_x = xtest # ann_y = mu @@ -496,4 +566,4 @@ def draw_vertical_normal(gp, incumbenty, ax, xtest=0.0, step=0.01, # arrowprops={'arrowstyle': 'fancy'}, # weight='heavy', zorder=15) - return (vcurve_x, vcurve_y, mu) + return vcurve_x, vcurve_y, mu diff --git a/w06_hpo_bo/scripts/ei_plots.py b/w06_hpo_bo/scripts/ei_plots.py index a30d0b8..8a395cd 100644 --- a/w06_hpo_bo/scripts/ei_plots.py +++ b/w06_hpo_bo/scripts/ei_plots.py @@ -2,10 +2,7 @@ warnings.filterwarnings('ignore') import argparse import logging -from functools import partial - -import numpy as np -from scipy.optimize import minimize +import os.path from sklearn.gaussian_process import GaussianProcessRegressor as GPR from sklearn.gaussian_process.kernels import Matern @@ -18,12 +15,15 @@ SEED = None TOGGLE_PRINT = False INIT_X_PRESENTATION = [2.5, 3.5, 5.5, 7, 9] +OUTPUT_DIR = os.path.abspath("./outputs/ei") + bounds["x"] = (2, 13) bounds["gp_y"] = (-5, 5) -# boplot.set_rcparams(**{"legend.loc": "lower left"}) -labels["xlabel"] = "$\lambda'$" -labels["gp_ylabel"] = "$c(\lambda')$" +boplot.set_rc("savefig", directory=OUTPUT_DIR) + +labels["xlabel"] = "$\lambda$" +labels["gp_ylabel"] = "" def initialize_dataset(initial_design, init=None): """ @@ -72,15 +72,6 @@ def visualize_ei(initial_design, init=None): :return: None """ - # 1. Plot GP fit on initial dataset - # 2. Mark current incumbent - # 3. Mark Zone of Probable Improvement - # 4. Mark Hypothetical Real cost of a random configuration - # 5. Display I(lambda) - # 6. 
Display Vertical Normal Distribution - - # boplot.set_rcparams(**{'legend.loc': 'lower left'}) - logging.debug("Visualizing EI with initial design {} and init {}".format(initial_design, init)) # Initialize dummy dataset x, y = initialize_dataset(initial_design=initial_design, init=init) @@ -99,227 +90,252 @@ def visualize_ei(initial_design, init=None): logging.debug("Model fit to dataset.\nOriginal Inputs: {0}\nOriginal Observations: {1}\n" "Predicted Means: {2}\nPredicted STDs: {3}".format(x, y, *(gp.predict(x, return_std=True)))) + # -------------------------------------------- + + def draw_basic_figure(obj_func=False): + fig, ax = plt.subplots(1, 1, squeeze=True) + ax.set_xlim(bounds["x"]) + ax.set_ylim(bounds["gp_y"]) + ax.grid() + boplot.plot_gp(model=gp, confidence_intervals=[1.0, 2.0, 3.0], custom_x=x, ax=ax) + if obj_func: + boplot.plot_objective_function(ax=ax) + boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) + boplot.highlight_output( + y=np.array([ymin]), + label=['$c_{inc}$'], + lloc='left', + ax=ax, + # disable_ticks=True + ) + # boplot.annotate_y_edge( + # label='$c_{inc}$', + # xy=((ax.get_xlim()[0] + x[ymin_arg]) / 2, ymin), + # ax=ax, + # align='left', + # yoffset=1.0 + # ) + + return fig, ax + + + def perform_finishing_tasks(ax, filename="", remove_legend=True): + + ax.legend().set_zorder(zorders['annotations_high']) + ax.set_xlabel(labels['xlabel']) + + if remove_legend: + ax.legend().remove() + + plt.tight_layout() + if TOGGLE_PRINT: + plt.savefig(f"{OUTPUT_DIR}/{filename}") + else: + plt.show() + # 1. Plot GP fit on initial dataset # -------------Plotting code ----------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, highlight_datapoint=None, highlight_label=None, ax=ax) - - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("ei_1.pdf") - else: - plt.show() - # ------------------------------------------- + fig, ax = draw_basic_figure(obj_func=True) - # 2. Mark current incumbent - # -------------Plotting code ----------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("ei_2.pdf") - else: - plt.show() + perform_finishing_tasks( + ax=ax, + filename="ei_1.pdf", + remove_legend=False + ) # ------------------------------------------- - # 3. Mark Zone of Probable Improvement + def draw_basic_figure_plus_zone(): + fig, ax = draw_basic_figure(obj_func=False) + boplot.darken_graph(y=ymin, ax=ax) + + return fig, ax + + + # 2a. 
Mark Zone of Probable Improvement + Display Legend # -------------Plotting code ----------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - boplot.darken_graph(y=ymin, ax=ax) - - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - ax.legend().remove() - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("ei_3.pdf") - else: - plt.show() - # ------------------------------------------- + fig, ax = draw_basic_figure_plus_zone() - # 4. Forget the underlying objective function + perform_finishing_tasks( + ax=ax, + filename="ei_2a.pdf", + remove_legend=False + ) # ------------------------------------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - # boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - boplot.darken_graph(y=ymin, ax=ax) - - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("ei_4.pdf") - else: - plt.show() + # 2b. Mark Zone of Probable Improvement + Remove Legend + # -------------Plotting code ----------------- + fig, ax = draw_basic_figure_plus_zone() + perform_finishing_tasks( + ax=ax, + filename="ei_2b.pdf", + remove_legend=True + ) # ------------------------------------------- - # 5. 
Mark Hypothetical Real cost of a random configuration + def draw_distribution_for_candidate(ax, candidate, target_cost): + vcurve_x, vcurve_y, mu = boplot.draw_vertical_normal( + gp=gp, incumbenty=ymin, ax=ax, xtest=candidate, + xscale=2.0, yscale=1.0, fill=False, draw_domain=False + ) + + idx = np.where(np.logical_and(vcurve_y > target_cost - 0.1, vcurve_y < target_cost + 0.1)) + ann_y = vcurve_y[idx] + ann_x = vcurve_x[idx] + ax.fill_betweenx(ann_y, candidate, ann_x, alpha=1.0, facecolor='darkgreen', + zorder=zorders['annotations_high'] - 5) + + def draw_final_figure(sample_cost, vis_confs, inc_eq_loc_x, draw_improvement=True, draw_normals=True): + fig, ax = draw_basic_figure_plus_zone() + + labels = [r'$\lambda_%d$' % (idx + 1) for idx in range(len(vis_confs))] + boplot.highlight_configuration( + x=np.array(vis_confs), + label=labels, + lloc='bottom', + ax=ax, + disable_ticks=True + ) + for label, conf in zip(labels, vis_confs): + boplot.annotate_x_edge( + label=label, + xy=(conf + 0.6 * (ax.get_xlim()[1] - ax.get_xlim()[0]) / 10, ymin), + ax=ax, + align='bottom', + offset_param=1.9 + ) + boplot.highlight_output( + y=np.array([sample_cost, ymin]), + label=['c', '$c_{inc}$'], + lloc='left', + ax=ax + ) + # boplot.annotate_y_edge(label=r'c', xy=(lambda, cost), align='left', ax=ax) + + if draw_improvement: + ax.annotate(s='', xy=(inc_eq_loc_x, sample_cost), xytext=(inc_eq_loc_x, ymin), + arrowprops={'arrowstyle': 'simple', }) + + ax.text(inc_eq_loc_x - 1.0, sample_cost - 1.0, r'$I_c=c_{inc}-c$', weight='heavy') + + if draw_normals: + for idx in range(len(vis_confs)): + conf = vis_confs[idx] + draw_distribution_for_candidate(ax=ax, candidate=conf, target_cost=sample_cost) + + ax.annotate( + s=r"$p(c|\lambda_%d)$" % (idx+1), xy=(conf, sample_cost), xytext=(conf-1.8, sample_cost - 1.5), + arrowprops={'arrowstyle': 'fancy', 'shrinkA': 20.0}, + weight='heavy', color='darkgreen', zorder=zorders['annotations_high'] + ) + + + return fig, ax + + # 3. Mark Hypothetical Real cost of a random configuration # ------------------------------------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - # boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - boplot.darken_graph(y=ymin, ax=ax) - - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - candidate = 11.0 - cost = -3.5 - boplot.highlight_configuration(x=np.array([candidate]), label=r'$\lambda$', lloc='bottom', ax=ax) - boplot.highlight_output(y=np.array([cost]), label='', lloc='left', ax=ax) - boplot.annotate_y_edge(label=r'$c(\lambda)$', xy=(candidate, cost), align='left', ax=ax) - - ax.legend().remove() - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("ei_5.pdf") - else: - plt.show() + candidate1 = 4.5 + candidate2 = 11 + + fig, ax = draw_final_figure( + sample_cost=-1.5, + vis_confs=[candidate1], + inc_eq_loc_x=None, + draw_improvement=False, + draw_normals=False + ) + perform_finishing_tasks( + ax=ax, + filename="ei_3.pdf", + remove_legend=True + ) # ------------------------------------------- - # 6. Display I(lambda) + # 4. 
Display I(lambda) # ------------------------------------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - # boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - boplot.darken_graph(y=ymin, ax=ax) + fig, ax = draw_final_figure( + sample_cost=-1.5, + vis_confs=[candidate1], + inc_eq_loc_x=(candidate1 + candidate2) / 2, + draw_improvement=True, + draw_normals=False + ) - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') + perform_finishing_tasks( + ax=ax, + filename="ei_4.pdf", + remove_legend=True + ) + # ------------------------------------------- - boplot.highlight_configuration(x=np.array([candidate]), label=r'$\lambda$', lloc='bottom', ax=ax) - boplot.highlight_output(y=np.array([cost]), label='', lloc='left', ax=ax) - boplot.annotate_y_edge(label=r'$c(\lambda)$', xy=(candidate, cost), align='left', ax=ax) + # 5. Display Vertical Normal Distribution + # ------------------------------------------- - xmid = (x[ymin_arg][0] + candidate) / 2. - ax.annotate(s='', xy=(xmid, cost), xytext=(xmid, ymin), - arrowprops={'arrowstyle': '<|-|>',}) + fig, ax = draw_final_figure( + sample_cost=-1.5, + vis_confs=[candidate1], + inc_eq_loc_x=(candidate1 + candidate2) / 2 + ) - textx = xmid + (ax.get_xlim()[1] - ax.get_xlim()[0]) / 40 - ax.text(textx, (ymin + cost) / 2, r'$I^{(t)}(\lambda)$', weight='heavy') + perform_finishing_tasks( + ax=ax, + filename="ei_5.pdf", + remove_legend=True + ) + # ------------------------------------------- - ax.legend().remove() + # 6. Display improvement for c_1 with two configurations + # ------------------------------------------- - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("ei_6.pdf") - else: - plt.show() + fig, ax = draw_final_figure( + sample_cost=-1.5, + vis_confs=[candidate1, candidate2], + inc_eq_loc_x=(candidate1 + candidate2) / 2 + ) + perform_finishing_tasks( + ax=ax, + filename="ei_6.pdf", + remove_legend=True + ) # ------------------------------------------- - # 7. Display Vertical Normal Distribution + # 7. Display improvement for c_2 with two configurations # ------------------------------------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - # boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - boplot.darken_graph(y=ymin, ax=ax) - - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - boplot.highlight_configuration(x=np.array([candidate]), label=r'$\lambda$', lloc='bottom', ax=ax) - boplot.highlight_output(y=np.array([cost]), label='', lloc='left', ax=ax) - boplot.annotate_y_edge(label=r'$c(\lambda)$', xy=(candidate, cost), align='left', ax=ax) - - xmid = (x[ymin_arg][0] + candidate) / 2. 
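# The EI figures above build the acquisition up piece by piece: the improvement
# I_c = c_inc - c for a hypothetical cost c, and the predictive distribution
# p(c | lambda) at a candidate. EI is the expectation of max(c_inc - c, 0) under that
# Gaussian. A small sketch checking the closed form against a Monte-Carlo estimate;
# the GP data and candidate here are stand-ins, not the exact objects in ei_plots.py.
import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import Matern

rng = np.random.RandomState(0)
X = np.array([[2.5], [3.5], [5.5], [7.0], [9.0]])
y = np.sin(X).ravel()
gp = GPR(kernel=Matern()).fit(X, y)
c_inc = y.min()                      # incumbent cost

candidate = np.array([[4.5]])
mu, sigma = gp.predict(candidate, return_std=True)

# Closed form: EI = (c_inc - mu) * Phi(Z) + sigma * phi(Z), with Z = (c_inc - mu) / sigma
Z = (c_inc - mu) / sigma
ei_closed = (c_inc - mu) * norm.cdf(Z) + sigma * norm.pdf(Z)

# Monte-Carlo estimate: average improvement over samples c ~ p(c | lambda)
c_samples = rng.normal(mu[0], sigma[0], size=100000)
ei_mc = np.maximum(c_inc - c_samples, 0.0).mean()

print(ei_closed[0], ei_mc)           # the two estimates should agree closely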
- ax.annotate(s='', xy=(xmid, cost), xytext=(xmid, ymin), - arrowprops={'arrowstyle': '<|-|>',}) - - textx = xmid + (ax.get_xlim()[1] - ax.get_xlim()[0]) / 40 - ax.text(textx, (ymin + cost) / 2, r'$I^{(t)}(\lambda)$', weight='heavy') - - vcurve_x, vcurve_y, mu = boplot.draw_vertical_normal( - gp=gp, incumbenty=ymin, ax=ax, xtest=candidate, - xscale=2.0, yscale=1.0 + fig, ax = draw_final_figure( + sample_cost=-2.5, + vis_confs=[candidate1, candidate2], + inc_eq_loc_x=(candidate1 + candidate2) / 2 ) - ann_x = candidate + 0.3 * (np.max(vcurve_x) - candidate) / 2 - ann_y = ymin - (mu - ymin) / 2 + perform_finishing_tasks( + ax=ax, + filename="ei_7.pdf", + remove_legend=True + ) + # ------------------------------------------- - arrow_x = ann_x + 0.5 - arrow_y = ann_y - 3.0 - # label = "{:.2f}".format(candidate) - label = '\lambda' + # 8. Display improvement for c_3 with two configurations + # ------------------------------------------- - ax.annotate( - s=r'$PI^{(t)}(%s)$' % label, xy=(ann_x, ann_y), xytext=(arrow_x, arrow_y), - arrowprops={'arrowstyle': 'fancy'}, - weight='heavy', color='darkgreen', zorder=15 + fig, ax = draw_final_figure( + sample_cost=-3.0, + vis_confs=[candidate1, candidate2], + inc_eq_loc_x=(candidate1 + candidate2) / 2 ) - ax.legend().remove() - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("ei_7.pdf") - else: - plt.show() - + perform_finishing_tasks( + ax=ax, + filename="ei_8.pdf", + remove_legend=True + ) # ------------------------------------------- diff --git a/w06_hpo_bo/scripts/es_plots.py b/w06_hpo_bo/scripts/es_plots.py index ef8acf8..bcbb71c 100644 --- a/w06_hpo_bo/scripts/es_plots.py +++ b/w06_hpo_bo/scripts/es_plots.py @@ -1,28 +1,34 @@ import warnings + warnings.filterwarnings('ignore') import argparse import logging - +import os.path import numpy as np from sklearn.neighbors import KernelDensity as kd from sklearn.gaussian_process import GaussianProcessRegressor as GPR from sklearn.gaussian_process.kernels import Matern from matplotlib import pyplot as plt +from matplotlib import ticker as mtick import bo_plot_utils as boplot from bo_configurations import * -SEED = None TOGGLE_PRINT = False INIT_X_PRESENTATION = [2.5, 3.5, 5.5, 7, 9] +OUTPUT_DIR = os.path.abspath("./outputs/es") +GP_SAMPLE_COLOR_SEED = 1256 +GP_SAMPLE_SEED = 65 -labels["xlabel"] = "$\lambda'$" -labels["ylabel"] = "$c(\lambda')$" +labels["xlabel"] = "$\lambda$" +labels["ylabel"] = "" +# labels["ylabel"] = "$c(\lambda')$" bounds["acq_y"] = (0.0, 10.0) bounds["x"] = (2, 13) bounds["gp_y"] = (-5, 5) + def initialize_dataset(initial_design, init=None): """ Initialize some data to start fitting the GP on. @@ -74,17 +80,15 @@ def visualize_es(initial_design, init=None): # 2. Show GP fit on initial dataset, 1 sample, histogram # 3. Show GP fit on initial dataset, 3 samples, histogram # 4. Show GP fit on initial dataset, 50 samples, histogram - # 5. Show PDF derived from the histogram at 50 samples + # 5. Show PDF derived from the histogram at 10e9 samples # 6. Mark maximum of the PDF as next configuration to be evaluated - # a. Plot GP # b. Sample GP, mark minima, update histogram of lambda* # c. Repeat 2 for each sample. # d. 
Show results after multiple iterations - - boplot.set_rcparams(**{'figure.figsize': (22, 11)}) + boplot.set_rc('figure', figsize=(22, 11)) # Initial setup # ------------------------------------------- @@ -107,34 +111,135 @@ def visualize_es(initial_design, init=None): # ------------------------------------------- - def draw_samples(nsamples, ax1, ax2, show_min=False, return_pdf=False): + def bin_large_sample_size(nsamples, seed, return_pdf=False, batch_size=1280000): + # Used for plotting a histogram when a large number of samples are to be generated. + + logging.info(f"Generating batch-wise histogram data for {nsamples} samples {batch_size} samples at at time.") + + rng = np.random.RandomState(seed=seed) + counts = np.zeros_like(X_.flatten()) + bin_edges = np.zeros(shape=(counts.shape[0]+1)) + # Smoothen out the batches - we don't care about missing out a small overflow number of samples. + nsamples = (nsamples // batch_size) * batch_size + for idx in range(0, nsamples, batch_size): + # # Iterate in increments of batch_size samples, but check for an uneven batch in the last iteration + # batch_nsamples = batch_size if (nsamples - idx) % batch_size == 0 else nsamples - idx + if idx % (batch_size * 10) == 0: + logging.info(f"Generated {idx} samples out of an expected {nsamples}" + f"[{idx * 100.0 / nsamples}%].") + batch_nsamples = batch_size + mu = gp.sample_y(X=X_, n_samples=batch_nsamples, random_state=rng) + minima = X_[np.argmin(mu, axis=0), 0] + hist, bin_edges = np.histogram( + minima, bins=nbins, + range=bin_range, density=return_pdf, + ) + counts += hist + + logging.info(f"Finished generating {nsamples} samples.") + return counts, bin_edges + + + def draw_samples(nsamples, ax1, ax2, show_min=False, return_pdf=False, show_samples=True, show_hist=True, data=None): if not nsamples: - return - seed2 = 1256 - seed3 = 65 - - mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=seed3) - boplot.plot_gp_samples( - mu=mu, - nsamples=nsamples, - precision=histogram_precision, - custom_x=X_, - show_min=show_min, - ax=ax1, - seed=seed2 - ) - data_h = X_[np.argmin(mu, axis=0), 0] - logging.info("Shape of data_h is {}".format(data_h.shape)) - logging.debug("data_h is: {}".format(data_h)) - - bins = ax2.hist( - data_h, bins=nbins, - range=bin_range, density=return_pdf, - color='lightgreen', edgecolor='black', alpha=0.0 if return_pdf else 1.0 - ) + raise RuntimeError(f"Number of samples must be a positive integer, received " + f"{nsamples} of type {type(nsamples)}") + + # If data is not None, assume that it contains pre-computed histogram data + logging.debug("Recieved histogram data of shape %s." % str(np.array(data).shape)) + if data: + logging.debug("Histogram data contained %d counts and %d bins." 
%(np.array(data[0]).shape[0], np.array(data[1]).shape[0])) + counts = data[0] + bins = data[1] + return ax2.hist( + bins[:-1], + bins=bins, + density=return_pdf, + weights=counts, + color='lightgreen', edgecolor='black', alpha=0.0 if return_pdf else 1.0 + ) + + mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=GP_SAMPLE_SEED) + if show_samples: + boplot.plot_gp_samples( + mu=mu, + nsamples=nsamples, + precision=histogram_precision, + custom_x=X_, + show_min=show_min, + ax=ax1, + seed=GP_SAMPLE_COLOR_SEED + ) + minima = X_[np.argmin(mu, axis=0), 0] + logging.info("Shape of minima is {}".format(minima.shape)) + # logging.debug("minima is: {}".format(minima)) + + bins = None + if show_hist: + bins = ax2.hist( + minima, bins=nbins, + range=bin_range, density=return_pdf, + color='lightgreen', edgecolor='black', alpha=0.0 if return_pdf else 1.0 + ) return bins + + def draw_basic_plot(ax2_sci_not=False): + fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) + ax1.set_xlim(bounds['x']) + ax1.set_ylim(bounds['gp_y']) + ax2.set_xlim(bounds['x']) + ax2.set_ylim(bounds['acq_y']) + + if ax2_sci_not: + f = mtick.ScalarFormatter(useOffset=False, useMathText=True) + g = lambda x,pos : "${}$".format(f._formatSciNotation('%1.10e' % x)) + ax2.yaxis.set_major_formatter(mtick.FuncFormatter(g)) + + ax1.grid() + ax2.grid() + + boplot.plot_objective_function(ax=ax1) + boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax1) + + return fig, (ax1, ax2) + + + def draw_freq_plots(nsamples): + fig, (ax1, ax2) = draw_basic_plot() + + if nsamples == 1: + show_min = True + else: + show_min = False + + draw_samples(nsamples=nsamples, ax1=ax1, ax2=ax2, show_min=show_min) + + return fig, (ax1, ax2) + + + def finishing_touches(ax1, ax2, ax1_title, ax2_title, show_legend=False, figname="es.pdf"): + ax1.set_xlabel(labels['xlabel']) + # ax1.set_ylabel(labels['gp_ylabel']) + # ax1.set_title(ax1_title, loc='left') + + ax2.set_xlabel(labels['xlabel']) + ax2.set_ylabel(r'Frequency') + ax2.set_title(ax2_title, loc='left') + + if show_legend: + ax1.legend().set_zorder(zorders["legend"]) + else: + ax1.legend().remove() + + # plt.tight_layout() + plt.subplots_adjust(hspace=1.0) + if TOGGLE_PRINT: + plt.savefig(f"{OUTPUT_DIR}/{figname}") + else: + plt.show() + # 1. 
Show GP fit on initial dataset, 0 samples, histogram # ------------------------------------------- @@ -142,20 +247,8 @@ def draw_samples(nsamples, ax1, ax2, show_min=False, return_pdf=False): bounds['acq_y'] = (0.0, 1.0) - fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) - ax1.set_xlim(bounds['x']) - ax1.set_ylim(bounds['gp_y']) - ax2.set_xlim(bounds['x']) - ax2.set_ylim(bounds['acq_y']) - ax1.grid() - ax2.grid() - - boplot.plot_objective_function(ax=ax1) - boplot.plot_gp(model=gp, confidence_intervals=[3.0], ax=ax1, custom_x=x) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax1) - - nsamples = 0 - draw_samples(nsamples=nsamples, ax1=ax1, ax2=ax2, show_min=True) + fig, (ax1, ax2) = draw_basic_plot() + boplot.plot_gp(model=gp, confidence_intervals=[1.0, 2.0, 3.0], ax=ax1, custom_x=x) # Plot uniform prior for p_min xplot = boplot.get_plot_domain() @@ -165,10 +258,10 @@ def draw_samples(nsamples, ax1, ax2, show_min=False, return_pdf=False): ax2.plot(xplot[:, 0], yupper, color='green', linewidth=2.0) ax2.fill_between(xplot[:, 0], ylims[0], yupper, color='lightgreen') - ax1.legend().set_zorder(20) + # ax1.legend().set_zorder(zorders["legend"]) ax1.set_xlabel(labels['xlabel']) - ax1.set_ylabel(labels['gp_ylabel']) - ax1.set_title(r"Visualization of $\mathcal{G}^t$", loc='left') + # ax1.set_ylabel(labels['gp_ylabel']) + # ax1.set_title(r"Visualization of $\mathcal{G}^t$", loc='left') ax2.set_xlabel(labels['xlabel']) ax2.set_ylabel(r'$p_{min}$') @@ -176,232 +269,156 @@ def draw_samples(nsamples, ax1, ax2, show_min=False, return_pdf=False): plt.tight_layout() if TOGGLE_PRINT: - plt.savefig('es_1') + plt.savefig(f"{OUTPUT_DIR}/es_1.pdf") else: plt.show() # ------------------------------------------- + # 2. Show GP fit on initial dataset, 1 sample, histogram # ------------------------------------------- + nsamples = 1 bounds['acq_y'] = (0.0, 5.0) + ax1_title = r"One sample$" ax2_title = r'Frequency of $\lambda=\hat{\lambda}^*$' + figname = "es_2.pdf" - fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) - ax1.set_xlim(bounds['x']) - ax1.set_ylim(bounds['gp_y']) - ax2.set_xlim(bounds['x']) - ax2.set_ylim(bounds['acq_y']) - ax1.grid() - ax2.grid() - - boplot.plot_objective_function(ax=ax1) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax1) - - nsamples = 1 - draw_samples(nsamples=nsamples, ax1=ax1, ax2=ax2, show_min=True) - - ax1.legend().set_zorder(20) - ax1.set_xlabel(labels['xlabel']) - ax1.set_ylabel(labels['gp_ylabel']) - ax1.set_title(r"One sample from $\mathcal{G}^t$", loc='left') - - ax2.set_xlabel(labels['xlabel']) + fig, (ax1, ax2) = draw_freq_plots(nsamples=nsamples) - # ax2.set_ylabel(r'$p_{min}$') - ax2.set_ylabel(r'Frequency') - ax2.set_title(ax2_title, loc='left') - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig('es_2') - else: - plt.show() + finishing_touches( + ax1=ax1, ax2=ax2, + ax1_title=ax1_title, ax2_title=ax2_title, + show_legend=False, + figname=figname + ) # 3. 
Show GP fit on initial dataset, 10 samples, histogram # ------------------------------------------- + nsamples = 10 bounds['acq_y'] = (0.0, 10.0) + ax1_title = r"Ten samples$" ax2_title = r'Frequency of $\lambda=\hat{\lambda}^*$' + figname = "es_3.pdf" - fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) - ax1.set_xlim(bounds['x']) - ax1.set_ylim(bounds['gp_y']) - ax2.set_xlim(bounds['x']) - ax2.set_ylim(bounds['acq_y']) - ax1.grid() - ax2.grid() - - boplot.plot_objective_function(ax=ax1) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax1) - - nsamples = 10 - draw_samples(nsamples=nsamples, ax1=ax1, ax2=ax2) - - ax1.set_xlabel(labels['xlabel']) - ax1.set_ylabel(labels['gp_ylabel']) - ax1.set_title(r"Ten samples from $\mathcal{G}^t$", loc='left') - - ax2.set_xlabel(labels['xlabel']) - - # ax2.set_ylabel(r'$p_{min}$') - ax2.set_ylabel(r'Frequency') - ax2.set_title(ax2_title, loc='left') + fig, (ax1, ax2) = draw_freq_plots(nsamples=nsamples) - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig('es_3') - else: - plt.show() + finishing_touches( + ax1=ax1, ax2=ax2, + ax1_title=ax1_title, ax2_title=ax2_title, + show_legend=False, + figname=figname + ) # ------------------------------------------- # 4. Show GP fit on initial dataset, 200 samples, histogram # ------------------------------------------- + nsamples = 100 bounds["acq_y"] = (0.0, 20.0) + ax1_title = r"200 samples$" ax2_title = r'Frequency of $\lambda=\hat{\lambda}^*$' + figname="es_4.pdf" - fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) - ax1.set_xlim(bounds['x']) - ax1.set_ylim(bounds['gp_y']) - ax2.set_xlim(bounds['x']) - ax2.set_ylim(bounds['acq_y']) - ax1.grid() - ax2.grid() + fig, (ax1, ax2) = draw_freq_plots(nsamples=nsamples) - boplot.plot_objective_function(ax=ax1) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax1) + finishing_touches( + ax1=ax1, ax2=ax2, + ax1_title=ax1_title, ax2_title=ax2_title, + show_legend=False, + figname=figname + ) - nsamples = 200 - draw_samples(nsamples=nsamples, ax1=ax1, ax2=ax2) - - ax1.set_xlabel(labels['xlabel']) - ax1.set_ylabel(labels['gp_ylabel']) - ax1.set_title(r"200 samples from $\mathcal{G}^t$", loc='left') - - ax2.set_xlabel(labels['xlabel']) - - # ax2.set_ylabel(r'$p_{min}$') - ax2.set_ylabel(r'Frequency') - ax2.set_title(ax2_title, loc='left') - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig('es_4') - else: - plt.show() # ------------------------------------------- - # 5. Show PDF derived from the histogram at 200 samples + # 5. 
Show PDF derived from the histogram at 10e9 samples # ------------------------------------------- - ax2_title = "$\hat{P}(\lambda=\lambda^*)$" - bounds["acq_y"] = (0.0, 1.0) - - fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) - ax1.set_xlim(bounds['x']) - ax1.set_ylim(bounds['gp_y']) - ax2.set_xlim(bounds['x']) - ax2.set_ylim(bounds["acq_y"]) - ax1.grid() - ax2.grid() - - boplot.plot_objective_function(ax=ax1) - boplot.plot_gp(model=gp, confidence_intervals=[3.0], ax=ax1, custom_x=x) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax1) - - nsamples = 200 - seed3 = 65 - - mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=seed3) - data_h = X_[np.argmin(mu, axis=0), 0] + nsamples = int(1e9) # Generate ~1 Billion samples + bounds["acq_y"] = (0.0, nsamples / 10.0) + # ax1_title = r"200 samples from $\mathcal{G}^t$" + # ax2_title = "$\hat{P}(\lambda=\lambda^*)$" + ax1_title = r"A very large number of samples" + ax2_title = r'Frequency of $\lambda=\hat{\lambda}^*$' + figname = "es_5.pdf" - kde = kd(kernel='gaussian', bandwidth=0.75).fit(data_h.reshape(-1, 1)) - xplot = boplot.get_plot_domain() - ys = np.exp(kde.score_samples(xplot)) + fig, (ax1, ax2) = draw_basic_plot(ax2_sci_not=True) - ax2.plot(xplot, ys, color='green', lw=2.) - ax2.fill_between(xplot[:, 0], ax2.get_ylim()[0], ys, color='lightgreen') + # Draw only a limited number of samples + draw_samples(nsamples=200, ax1=ax1, ax2=ax2, show_min=False, show_samples=True, show_hist=False) - ax1.set_xlabel(labels['xlabel']) - ax1.set_ylabel(labels['gp_ylabel']) - ax1.set_title(r"Visualization of $\mathcal{G}^t$", loc='left') + # Use an alternate procedure to generate the histogram data + counts, bins = bin_large_sample_size(nsamples, seed=GP_SAMPLE_SEED, return_pdf=False, batch_size=1280000) + hist_data = (counts, bins) - ax2.set_xlabel(labels['xlabel']) - ax2.set_ylabel(r'$p_{min}$') - ax2.set_title(ax2_title, loc='left') + # Draw histogram only for a large number of samples + draw_samples(nsamples=nsamples, ax1=ax1, ax2=ax2, show_min=False, + show_samples=False, show_hist=True, data=hist_data) - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig('es_5') - else: - plt.show() + finishing_touches( + ax1=ax1, ax2=ax2, + ax1_title=ax1_title, ax2_title=ax2_title, + show_legend=False, + figname=figname + ) # ------------------------------------------- # 6. 
Mark maximum of the PDF as next configuration to be evaluated # ------------------------------------------- - ax2_title = "$\hat{P}(\lambda=\lambda^*)$" - bounds["acq_y"] = (0.0, 1.0) - - fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) - ax1.set_xlim(bounds['x']) - ax1.set_ylim(bounds['gp_y']) - ax2.set_xlim(bounds['x']) - ax2.set_ylim(bounds["acq_y"]) - ax1.grid() - ax2.grid() + figname = "es_6.pdf" - boplot.plot_objective_function(ax=ax1) - boplot.plot_gp(model=gp, confidence_intervals=[3.0], ax=ax1, custom_x=x) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax1) + fig, (ax1, ax2) = draw_basic_plot(ax2_sci_not=True) - nsamples = 200 - seed3 = 65 + # Draw only a limited number of samples + draw_samples(nsamples=200, ax1=ax1, ax2=ax2, show_min=False, show_samples=True, show_hist=False) - mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=seed3) - data_h = X_[np.argmin(mu, axis=0), 0] + # Draw histogram only for a large number of samples using previously generated histogram data + draw_samples(nsamples=nsamples, ax1=ax1, ax2=ax2, + show_min=False, show_samples=False, show_hist=True, data=hist_data) - kde = kd(kernel='gaussian', bandwidth=0.75).fit(data_h.reshape(-1, 1)) xplot = boplot.get_plot_domain() - ys = np.exp(kde.score_samples(xplot)) - idx_umax = np.argmax(ys) - boplot.highlight_configuration(x=xplot[idx_umax], label='', ax=ax1, disable_ticks=True) - boplot.annotate_x_edge(label=r'$\lambda^{(t)}$', xy=(xplot[idx_umax], ax1.get_ylim()[0]), - ax=ax1, align='top', offset_param= 1.5) - boplot.highlight_configuration(x=xplot[idx_umax], label='', ax=ax2, disable_ticks=True) - boplot.annotate_x_edge(label=r'$\lambda^{(t)}$', xy=(xplot[idx_umax], ys[idx_umax]), - ax=ax2, align='top', offset_param=1.0) - - ax2.plot(xplot, ys, color='green', lw=2.) - ax2.fill_between(xplot[:, 0], ax2.get_ylim()[0], ys, color='lightgreen') - - ax1.set_xlabel(labels['xlabel']) - ax1.set_ylabel(labels['gp_ylabel']) - ax1.set_title(r"Visualization of $\mathcal{G}^t$", loc='left') - - ax2.set_xlabel(labels['xlabel']) - ax2.set_ylabel(r'$p_{min}$') - ax2.set_title(ax2_title, loc='left') - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig('es_6') - else: - plt.show() + idx_umax = np.argmax(counts) + xmax = (bins[idx_umax] + bins[idx_umax + 1]) / 2.0 + logging.info(f"Highlighting xmax as configuration at index {idx_umax} with count {counts[idx_umax]}, " + f"at configuration {xmax}.") + boplot.highlight_configuration(x=xmax, label=r'$\lambda^{(t)}$', ax=ax1, disable_ticks=False) + # boplot.annotate_x_edge(label=r'$\lambda^{(t)}$', xy=(xplot[idx_umax], ax1.get_ylim()[0]), + # ax=ax1, align='top', offset_param=1.5) + boplot.highlight_configuration(x=xmax, label=r'$\lambda^{(t)}$', ax=ax2, disable_ticks=False) + # boplot.annotate_x_edge(label=r'$\lambda^{(t)}$', xy=(xplot[idx_umax], ys[idx_umax]), + # ax=ax2, align='top', offset_param=1.0) + + finishing_touches( + ax1=ax1, ax2=ax2, + ax1_title=ax1_title, ax2_title=ax2_title, + show_legend=False, + figname=figname + ) + + # nsamples = 200 + # seed3 = 65 + # + # mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=seed3) + # data_h = X_[np.argmin(mu, axis=0), 0] + # + # kde = kd(kernel='gaussian', bandwidth=0.75).fit(data_h.reshape(-1, 1)) + # + # ax2.plot(xplot, ys, color='green', lw=2.) 
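# The entropy-search figures above estimate p_min, the probability that a configuration
# is the optimum, by sampling whole functions from the GP posterior, recording where
# each sample attains its minimum, and normalizing the histogram of those argmins
# (batch-wise for very large sample counts, as in bin_large_sample_size). A compact
# sketch of that estimator; the grid, GP data and sample counts are stand-ins for the
# values used in es_plots.py.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import Matern

X_train = np.array([[2.5], [3.5], [5.5], [7.0], [9.0]])
y_train = np.sin(X_train).ravel()
gp = GPR(kernel=Matern()).fit(X_train, y_train)

X_grid = np.linspace(2, 13, 200).reshape(-1, 1)       # discretized search space


def estimate_p_min(gp, X_grid, nsamples=1000, batch_size=200, seed=65):
    """Monte-Carlo estimate of p_min on a fixed grid, generated batch-wise."""
    rng = np.random.RandomState(seed)
    counts = np.zeros(X_grid.shape[0])
    for _ in range(0, nsamples, batch_size):
        samples = gp.sample_y(X_grid, n_samples=batch_size, random_state=rng)
        argmins = np.argmin(samples, axis=0)           # index of each sample's minimum
        counts += np.bincount(argmins, minlength=X_grid.shape[0])
    return counts / counts.sum()                       # normalize frequencies into p_min


p_min = estimate_p_min(gp, X_grid)
print(X_grid[np.argmax(p_min), 0])                     # most probable minimizer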
+ # ax2.fill_between(xplot[:, 0], ax2.get_ylim()[0], ys, color='lightgreen') # ------------------------------------------- def main(init_size, initial_design): - visualize_es( - init=init_size, - initial_design=initial_design, - ) - + visualize_es( + init=init_size, + initial_design=initial_design, + ) if __name__ == '__main__': @@ -449,9 +466,9 @@ def main(init_size, initial_design): else: boplot.enable_onscreen_display() - #init_size = max(1, int(args.num_func_evals * args.fraction_init)) + # init_size = max(1, int(args.num_func_evals * args.fraction_init)) main( init_size=args.init_db_size, initial_design=args.initial_design, - ) \ No newline at end of file + ) diff --git a/w06_hpo_bo/scripts/lcb_plots.py b/w06_hpo_bo/scripts/lcb_plots.py index 6cddfa3..ab4609a 100644 --- a/w06_hpo_bo/scripts/lcb_plots.py +++ b/w06_hpo_bo/scripts/lcb_plots.py @@ -2,8 +2,7 @@ warnings.filterwarnings('ignore') import argparse import logging -from functools import partial - +import os.path import numpy as np from sklearn.gaussian_process import GaussianProcessRegressor as GPR from sklearn.gaussian_process.kernels import Matern @@ -17,12 +16,13 @@ SEED = None TOGGLE_PRINT = False INIT_X_PRESENTATION = [2.5, 3.5, 5.5, 7, 9] +OUTPUT_DIR = os.path.abspath("./outputs/lcb") bounds["x"] = (2, 13) bounds["gp_y"] = (-5, 5) # boplot.set_rcparams(**{"legend.loc": "lower left"}) -labels["xlabel"] = "$\lambda'$" -labels["gp_ylabel"] = "$c(\lambda')$" +labels["xlabel"] = "$\lambda$" +labels["gp_ylabel"] = "" def initialize_dataset(initial_design, init=None): """ @@ -104,14 +104,12 @@ def visualize_lcb(initial_design, init=None): boplot.plot_objective_function(ax=ax) boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, highlight_datapoint=None, highlight_label=None, ax=ax) - ax.legend().set_zorder(20) + ax.legend().set_zorder(zorders['legend']) ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') plt.tight_layout() if TOGGLE_PRINT: - plt.savefig("lcb_1.pdf") + plt.savefig(f"{OUTPUT_DIR}/lcb_1.pdf") else: plt.show() # ------------------------------------------- @@ -130,85 +128,81 @@ def visualize_lcb(initial_design, init=None): boplot.plot_objective_function(ax=ax) boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - ax.legend().set_zorder(20) + ax.legend().set_zorder(zorders['legend']) ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("lcb_2.pdf") - else: - plt.show() - # ------------------------------------------- - - # 3. 
Show LCB in parallel - # -------------Plotting code ----------------- - - if TOGGLE_PRINT: - fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True, figsize=(18, 9)) - else: - fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) - - ax1.set_xlim(bounds["x"]) - ax1.set_ylim(bounds["gp_y"]) - ax1.grid() - boplot.plot_gp(model=gp, confidence_intervals=[kappa], type='lower', custom_x=x, ax=ax1) - boplot.plot_objective_function(ax=ax1) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax1) - - lcb_max = get_lcb_maximum(gp, kappa) - logging.info("LCB Maximum at:{}".format(lcb_max)) - boplot.highlight_configuration(x=lcb_max[0], label=None, lloc='bottom', ax=ax1) - ax1.set_xlabel(labels['xlabel']) - ax1.set_ylabel(labels['gp_ylabel']) - ax1.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - ax2.set_xlim(bounds["x"]) - ax2.set_ylim(bounds["acq_y"]) - ax2.grid() - ax2.set_xlabel(labels['xlabel']) - ax2.set_ylabel(labels['acq_ylabel']) - ax2.set_title(r"Visualization of $LCB$", loc='left') - - boplot.highlight_configuration(x=lcb_max[0], label=None, lloc='bottom', ax=ax2) - boplot.plot_acquisition_function(acquisition_functions['LCB'], 0.0, gp, kappa, ax=ax2) plt.tight_layout() if TOGGLE_PRINT: - plt.savefig("lcb_3.pdf") + plt.savefig(f"{OUTPUT_DIR}/lcb_2.pdf") else: plt.show() # ------------------------------------------- - # 4. Mark next sample - # -------------Plotting code ----------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[3.0], type='lower', custom_x=x, ax=ax) - boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - - lcb_max = get_lcb_maximum(gp, 3.0) - logging.info("LCB Maximum at:{}".format(lcb_max)) - boplot.highlight_configuration(x=lcb_max[0], label=None, lloc='bottom', ax=ax) - boplot.highlight_output(y=lcb_max[1], label='', lloc='left', ax=ax) - boplot.annotate_y_edge(label=r'${\hat{c}}^{(t)}(%.2f)$' % lcb_max[0], xy=lcb_max, align='left', ax=ax) - - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - ax.legend().remove() - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("lcb_4.pdf") - else: - plt.show() + # # 3. 
Show LCB in parallel + # # -------------Plotting code ----------------- + # + # if TOGGLE_PRINT: + # fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True, figsize=(18, 9)) + # else: + # fig, (ax1, ax2) = plt.subplots(2, 1, squeeze=True) + # + # ax1.set_xlim(bounds["x"]) + # ax1.set_ylim(bounds["gp_y"]) + # ax1.grid() + # boplot.plot_gp(model=gp, confidence_intervals=[kappa], type='lower', custom_x=x, ax=ax1) + # boplot.plot_objective_function(ax=ax1) + # boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax1) + # + # lcb_max = get_lcb_maximum(gp, kappa) + # logging.info("LCB Maximum at:{}".format(lcb_max)) + # boplot.highlight_configuration(x=lcb_max[0], label=None, lloc='bottom', ax=ax1) + # ax1.set_xlabel(labels['xlabel']) + # + # ax2.set_xlim(bounds["x"]) + # ax2.set_ylim(bounds["acq_y"]) + # ax2.grid() + # ax2.set_xlabel(labels['xlabel']) + # ax2.set_ylabel(labels['acq_ylabel']) + # ax2.set_title(r"Visualization of $LCB$", loc='left') + # + # boplot.highlight_configuration(x=lcb_max[0], label=None, lloc='bottom', ax=ax2) + # boplot.plot_acquisition_function(acquisition_functions['LCB'], 0.0, gp, kappa, ax=ax2) + # + # plt.tight_layout() + # if TOGGLE_PRINT: + # plt.savefig(f"{OUTPUT_DIR}/lcb_3.pdf") + # else: + # plt.show() + # # ------------------------------------------- + # + # # 4. Mark next sample + # # -------------Plotting code ----------------- + # fig, ax = plt.subplots(1, 1, squeeze=True) + # ax.set_xlim(bounds["x"]) + # ax.set_ylim(bounds["gp_y"]) + # ax.grid() + # boplot.plot_gp(model=gp, confidence_intervals=[3.0], type='lower', custom_x=x, ax=ax) + # boplot.plot_objective_function(ax=ax) + # boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) + # + # lcb_max = get_lcb_maximum(gp, 3.0) + # logging.info("LCB Maximum at:{}".format(lcb_max)) + # boplot.highlight_configuration(x=lcb_max[0], label=None, lloc='bottom', ax=ax) + # boplot.highlight_output(y=lcb_max[1], label='', lloc='left', ax=ax) + # boplot.annotate_y_edge(label=r'${\hat{c}}^{(t)}(%.2f)$' % lcb_max[0], xy=lcb_max, align='left', ax=ax) + # + # ax.legend().set_zorder(zorders['legend']) + # ax.set_xlabel(labels['xlabel']) + # ax.set_ylabel(labels['gp_ylabel']) + # ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') + # + # ax.legend().remove() + # + # plt.tight_layout() + # if TOGGLE_PRINT: + # plt.savefig(f"{OUTPUT_DIR}/lcb_4.pdf") + # else: + # plt.show() # ------------------------------------------- diff --git a/w06_hpo_bo/scripts/look_ahead_kg_plots.py b/w06_hpo_bo/scripts/look_ahead_kg_plots.py index 90aa617..a4842f0 100644 --- a/w06_hpo_bo/scripts/look_ahead_kg_plots.py +++ b/w06_hpo_bo/scripts/look_ahead_kg_plots.py @@ -2,10 +2,7 @@ warnings.filterwarnings('ignore') import argparse import logging -from functools import partial - -import numpy as np -from scipy.optimize import minimize +import os.path from sklearn.gaussian_process import GaussianProcessRegressor as GPR from sklearn.gaussian_process.kernels import Matern @@ -18,9 +15,11 @@ SEED = None TOGGLE_PRINT = False INIT_X_PRESENTATION = [2.5, 4, 6, 7, 8] +OUTPUT_DIR = os.path.abspath("./outputs/lookahead_kg") -labels["xlabel"] = "$\lambda'$" -labels["gp_ylabel"] = "$c(\lambda')$" +labels["xlabel"] = "$\lambda$" +# colors['highlighted_observations'] = 'red' +# labels["gp_ylabel"] = "$c(\lambda')$" def initialize_dataset(initial_design, init=None): """ @@ -88,8 +87,6 @@ def visualize_look_ahead(initial_design, init=None): 
"Predicted Means: {2}\nPredicted STDs: {3}".format(x, y, *(gp.predict(x, return_std=True)))) # Assume next evaluation location - # x_ = np.mean(x, keepdims=True) - #x_ = np.array([[5.8]]) x_ = np.array([[5.0]]) print(x_) y_ = f(x_[0]) @@ -108,136 +105,104 @@ def visualize_look_ahead(initial_design, init=None): logging.info("Mu-star at time t+1: {}".format(mu_star_t1_xy)) # -------------------------Plotting madness begins--------------------------- - # Draw Figure 1. - # fig.tight_layout() - labels['gp_mean'] = r'Mean - $\mu^{(t)}(\cdot)$' - # labels['incumbent'] = r'Incumbent - ${(\mu^*)}^t$' + def draw_basic_figure(tgp=gp, tx=x, tX_=x, tY_=y, title='', highlight_datapoint=None, highlight_label="", ax=None): + if ax is None: + fig, ax = plt.subplots(1, 1, squeeze=True) + plt.subplots_adjust(0.05, 0.15, 0.95, 0.85) + figflag = True + else: + figflag = False - def draw_figure_1(ax): ax.set_xlim(bounds["x"]) ax.set_ylim(bounds["gp_y"]) + if title: + ax.set_title(title, loc='left') ax.grid() boplot.plot_objective_function(ax=ax) - boplot.plot_gp(model=gp, confidence_intervals=[1.0], ax=ax, custom_x=x) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, ax=ax) + boplot.plot_gp(model=tgp, confidence_intervals=[1.0, 2.0, 3.0], ax=ax, custom_x=tx) + if highlight_datapoint: + boplot.mark_observations(X_=tX_, Y_=tY_, mark_incumbent=False, ax=ax, + highlight_datapoint=highlight_datapoint, highlight_label=highlight_label) + else: + boplot.mark_observations(X_=tX_, Y_=tY_, mark_incumbent=False, ax=ax) + + if figflag: + return fig, ax + else: + return ax + - ax.legend() + def perform_finishing_tasks(ax, filename="", remove_legend=True): + ax.legend().set_zorder(zorders['legend']) ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - if TOGGLE_PRINT: - fig, ax = plt.subplots(1, 1, squeeze=True) - draw_figure_1(ax) - plt.tight_layout() - plt.savefig("look_ahead_1.pdf") + if remove_legend: + ax.legend().remove() - fig, ax = plt.subplots(1, 1, squeeze=True) - draw_figure_1(ax) - boplot.highlight_configuration(mu_star_t_xy[0], lloc='bottom', ax=ax) - boplot.highlight_output(mu_star_t_xy[1], label='', lloc='right', ax=ax, fontsize=30) - boplot.annotate_y_edge(label=r'${(\mu^*)}^{(t)}$', xy=mu_star_t_xy, align='right', ax=ax) - ax.legend().remove() + # plt.tight_layout() + if TOGGLE_PRINT: + # plt.savefig(f"{OUTPUT_DIR}/{filename}", bbox_inches='tight') + plt.savefig(f"{OUTPUT_DIR}/{filename}") + else: + plt.show() - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("look_ahead_KG_2.pdf") - else: - plt.show() + # --------------------------------------- + # Draw look ahead 1. + + labels['gp_mean'] = r'Mean: $\mu^{(t)}(\cdot)$' + + fig, ax = draw_basic_figure(title="") + perform_finishing_tasks(ax=ax, filename="look_ahead_1.pdf", remove_legend=False) - # End of figure 1. 
# --------------------------------------- - # Draw figure 1-2 transition animation A + # Draw look ahead 2 + fig, ax = draw_basic_figure(title="") - fig, ax = plt.subplots(1, 1, squeeze=True) - draw_figure_1(ax) logging.debug("Placing vertical on configuration: {}".format(x_)) - boplot.highlight_configuration(x=x_, label='', lloc='bottom', ax=ax, ha='center') - boplot.annotate_x_edge(label=r'$\lambda$',xy=(x_, y_), align='bottom', ax=ax) + # boplot.highlight_configuration(x=x_, label='', lloc='bottom', ax=ax, ha='center') + boplot.highlight_configuration(x=x_, label='', lloc='bottom', ax=ax, disable_ticks=True) + boplot.annotate_x_edge(label=r'$\lambda$', xy=(x_, y_), align='bottom', ax=ax) - plt.tight_layout() - plt.legend().remove() - if TOGGLE_PRINT: - plt.savefig("look_ahead_1a.pdf") - else: - plt.show() + perform_finishing_tasks(ax=ax, filename="look_ahead_2.pdf", remove_legend=True) - # End of figure 1-2 transition animation A # --------------------------------------- - # Draw figure 1-2 transition animation B + # Draw look ahead 3 + + fig, ax = draw_basic_figure(title="") - fig, ax = plt.subplots(1, 1, squeeze=True) - draw_figure_1(ax) - logging.debug("Placing vertical on configuration: {}".format(x_)) boplot.highlight_configuration(x=x_, label='', lloc='bottom', ax=ax, ha='right') - boplot.annotate_x_edge(label=r'$\lambda$',xy=(x_, y_), align='bottom', ax=ax) + boplot.annotate_x_edge(label=r'$\lambda$', xy=(x_, y_), align='bottom', ax=ax) + boplot.highlight_output(y_, label='', lloc='right', ax=ax, fontsize=28) boplot.annotate_y_edge(label=r'$c(\lambda)$', xy=(x_, y_), align='right', ax=ax) + ax.scatter( x_, y_, color=colors['highlighted_observations'], marker='X', label=r"Hypothetical Observation $<\lambda, c(\lambda)>$", - zorder=11 + zorder=zorders['annotations_normal'] ) - plt.tight_layout() - plt.legend().remove() - if TOGGLE_PRINT: - plt.savefig("look_ahead_1b.pdf") - else: - plt.show() + perform_finishing_tasks(ax=ax, filename="look_ahead_3.pdf", remove_legend=True) - # End of figure 1-2 transition animation B # --------------------------------------- - # Draw Figure 2. + # Draw look ahead 4. 
- labels['gp_mean'] = r'Mean - $\mu^{t+1}(\cdot)|_\lambda$' - # labels['incumbent'] = r'Incumbent - ${(\mu^*)}^{t+1}|_\lambda$' + labels['gp_mean'] = r'Mean: $\mu^{(t+1)}(\cdot)|_\lambda$' - def draw_figure_2(ax): - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_objective_function(ax=ax) - boplot.plot_gp(model=gp2, confidence_intervals=[1.0], ax=ax, custom_x=X2_) - boplot.mark_observations(X_=X2_, Y_=Y2_, highlight_datapoint=np.where(np.isclose(X2_, x_))[0], - mark_incumbent=False, - highlight_label=r"Hypothetical Observation $<\lambda, c(\lambda)>$", ax=ax) + fig, ax = draw_basic_figure(tgp=gp2, tx=x, tX_=X2_, tY_=Y2_, title='', + highlight_datapoint=np.where(np.isclose(X2_, x_))[0], + highlight_label=r"Hypothetical Observation $<\lambda, c(\lambda)>$") - ax.legend() - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t+1)}|_\lambda$", loc='left') + perform_finishing_tasks(ax=ax, filename="look_ahead_4.pdf", remove_legend=False) - if TOGGLE_PRINT: - fig, ax = plt.subplots(1, 1, squeeze=True) - draw_figure_2(ax) - plt.tight_layout() - plt.savefig("look_ahead_3.pdf") - - fig, ax = plt.subplots(1, 1, squeeze=True) - draw_figure_2(ax) - boplot.highlight_configuration(mu_star_t1_xy[0], lloc='bottom', ax=ax, ha='right') - boplot.highlight_output(mu_star_t1_xy[1], label='', lloc='right', ax=ax, fontsize=28) - boplot.annotate_y_edge(label=r'${(\mu^*)}^{(t+1)}|_\lambda$', xy=mu_star_t1_xy, align='right', ax=ax) - ax.legend().remove() - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("look_ahead_KG_4.pdf") - else: - plt.show() - - - # End of figure 2. # --------------------------------------- - # Draw figure 2-3 transition animation A - # Vertical comparison of look-ahead at any given x - def draw_2_3_transition(imaginary_lambda, ax1, ax2): + + def draw_vertical_comparison(imaginary_lambda, ax1, ax2): tx_ = np.array([[imaginary_lambda]]) ty_ = f(tx_[0]) @@ -253,7 +218,9 @@ def draw_2_3_transition(imaginary_lambda, ax1, ax2): tgp.fit(tX_, tY_) # fit the model tmu_star_t1_xy = get_mu_star(tgp) - draw_figure_1(ax1) + + # Draw the left hand figure using the old gp on ax1 + draw_basic_figure(tgp=gp, title=r"$\hat{c}^{(t)}$", ax=ax1) logging.debug("Placing vertical on configuration: {}".format(tx_)) @@ -263,100 +230,110 @@ def draw_2_3_transition(imaginary_lambda, ax1, ax2): color=colors['highlighted_observations'], marker='X', label=r"Hypothetical Observation $<\lambda, c(\lambda)>$", - zorder=11 + zorder=zorders["annotations_normal"] ) ax1.legend().remove() - ax2.set_xlim(bounds["x"]) - ax2.set_ylim(bounds["gp_y"]) - ax2.grid() - boplot.plot_objective_function(ax=ax2) - boplot.plot_gp(model=tgp, confidence_intervals=[1.0], ax=ax2, custom_x=tX_) - boplot.mark_observations(X_=tX_, Y_=tY_, highlight_datapoint=np.where(np.isclose(tX_, tx_))[0], - mark_incumbent=False, - highlight_label=r"Hypothetical Observation $<\lambda, c(\lambda)>$", ax=ax2) + # Draw the right hand figure using the hypothetical gp tgp on ax2 + draw_basic_figure(tgp=tgp, tx=tX_, tX_=tX_, tY_=tY_, title=r"$\hat{c}^{(t+1)}|_\lambda$", + highlight_datapoint=np.where(np.isclose(tX_, tx_))[0], + highlight_label=r"Hypothetical Observation $<\lambda, c(\lambda)>$", ax=ax2) - ax2.legend() + + def finishing_touches_parallel(ax1, ax2, filename=""): + ax1.set_xlabel(labels['xlabel']) ax2.set_xlabel(labels['xlabel']) - ax2.set_ylabel(labels['gp_ylabel']) - ax2.set_title(r"Visualization of $\mathcal{G}^{(t+1)}|_\lambda$", loc='left') - 
ax2.legend().remove() + plt.tight_layout() + if TOGGLE_PRINT: + plt.savefig(f"{OUTPUT_DIR}/{filename}") + else: + plt.show() - # Actual 2-3 transition A - if TOGGLE_PRINT: - fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True, figsize=(18,9)) - else: - fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True) - draw_2_3_transition(imaginary_lambda=5.0, ax1=ax1, ax2=ax2) - plt.tight_layout() + # --------------------------------------- + # Draw look ahead 5 - if TOGGLE_PRINT: - plt.savefig("look_ahead_3a.pdf") - else: - plt.show() + fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True, figsize=(22, 9)) + draw_vertical_comparison(imaginary_lambda=5.0, ax1=ax1, ax2=ax2) + finishing_touches_parallel(ax1=ax1, ax2=ax2, filename="look_ahead_5.pdf") - # End of figure 2-3 transition animation A # --------------------------------------- - # Draw figure 2-3 transition animation B + # Draw look ahead 6 - if TOGGLE_PRINT: - fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True, figsize=(18,9)) - else: - fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True) + fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True, figsize=(22, 9)) + draw_vertical_comparison(imaginary_lambda=5.5, ax1=ax1, ax2=ax2) + finishing_touches_parallel(ax1=ax1, ax2=ax2, filename="look_ahead_6.pdf") - draw_2_3_transition(imaginary_lambda=5.5, ax1=ax1, ax2=ax2) - plt.tight_layout() + # --------------------------------------- + # Draw look ahead 5 + + fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True, figsize=(22, 9)) + draw_vertical_comparison(imaginary_lambda=3.5, ax1=ax1, ax2=ax2) + finishing_touches_parallel(ax1=ax1, ax2=ax2, filename="look_ahead_7.pdf") - if TOGGLE_PRINT: - plt.savefig("look_ahead_3b.pdf") - else: - plt.show() - # End of figure 2-3 transition animation B # --------------------------------------- - # Draw figure 2-3 transition animation C + # Draw KG 1 - if TOGGLE_PRINT: - fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True, figsize=(18,9)) - else: - fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True) + labels['gp_mean'] = r'Mean: $\mu^{(t)}(\cdot)$' - draw_2_3_transition(imaginary_lambda=3.5, ax1=ax1, ax2=ax2) + fig, ax = draw_basic_figure(title="") + perform_finishing_tasks(ax=ax, filename="kg_1.pdf", remove_legend=False) - plt.tight_layout() + # --------------------------------------- + # Draw kg 2 + fig, ax = draw_basic_figure(title="") - if TOGGLE_PRINT: - plt.savefig("look_ahead_3c.pdf") - else: - plt.show() + boplot.highlight_configuration(mu_star_t_xy[0], lloc='bottom', ax=ax, disable_ticks=True) + boplot.annotate_x_edge(label="%.2f" % mu_star_t_xy[0], xy=mu_star_t_xy, ax=ax, align='bottom',offset_param=1.5) + boplot.highlight_output(mu_star_t_xy[1], label='', lloc='right', ax=ax, fontsize=30, disable_ticks=True) + boplot.annotate_y_edge(label=r'${(\mu^*)}^{(t)}$', xy=mu_star_t_xy, align='right', ax=ax, yoffset=1.5) + + perform_finishing_tasks(ax=ax, filename="kg_2.pdf", remove_legend=True) + + # --------------------------------------- + # Draw kg 3 + + fig, ax = draw_basic_figure(tgp=gp2, tx=x, tX_=X2_, tY_=Y2_, title='', + highlight_datapoint=np.where(np.isclose(X2_, x_))[0], + highlight_label=r"Hypothetical Observation $<\lambda, c(\lambda)>$") + + perform_finishing_tasks(ax=ax, filename="kg_3.pdf", remove_legend=True) - # End of figure 2-3 transition animation C # --------------------------------------- + # Draw kg 4 + + + fig, ax = draw_basic_figure(tgp=gp2, tx=x, tX_=X2_, tY_=Y2_, title='', + highlight_datapoint=np.where(np.isclose(X2_, x_))[0], + highlight_label=r"Hypothetical Observation $<\lambda, c(\lambda)>$") + + 
boplot.highlight_configuration(mu_star_t1_xy[0], lloc='bottom', ax=ax, disable_ticks=True) + boplot.annotate_x_edge(label="%.2f" % mu_star_t1_xy[0], xy=mu_star_t1_xy, ax=ax, align='bottom', offset_param=1.5) + + boplot.highlight_output(mu_star_t1_xy[1], label='', lloc='right', ax=ax, fontsize=28) + boplot.annotate_y_edge(label=r'${(\mu^*)}^{(t+1)}|_\lambda$', xy=mu_star_t1_xy, align='right', ax=ax, yoffset=1.5) + + perform_finishing_tasks(ax=ax, filename="kg_4.pdf", remove_legend=True) + + # --------------------------------------- + # Draw kg 5 + + fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True, figsize=(22, 9)) + draw_vertical_comparison(imaginary_lambda=x_.squeeze(), ax1=ax1, ax2=ax2) - # Draw Figure 3 for KG - fig, (ax1, ax2) = plt.subplots(1, 2, squeeze=True) - # fig.tight_layout() - labels['gp_mean'] = r'Mean - $\mu^(t)(\cdot)$' - draw_figure_1(ax1) boplot.highlight_output(mu_star_t_xy[1], label='', lloc='right', ax=ax1, fontsize=30) - boplot.annotate_y_edge(label='${(\mu^*)}^{(t)}$', xy=mu_star_t_xy, align='right', ax=ax1) - ax1.get_legend().remove() - labels['gp_mean'] = r'Mean - $\mu^{(t+1)}(\cdot)|_\lambda$' - draw_figure_2(ax2) + boplot.annotate_y_edge(label='${(\mu^*)}^{(t)}$', xy=mu_star_t_xy, align='right', ax=ax1, yoffset=1.5) + boplot.highlight_output(mu_star_t1_xy[1], label='', lloc='right', ax=ax2, fontsize=28) - boplot.annotate_y_edge(label='${(\mu^*)}^{(t+1)}|_\lambda$', xy=mu_star_t1_xy, align='left', ax=ax2) - ax2.get_legend().remove() + boplot.annotate_y_edge(label='${(\mu^*)}^{(t+1)}|_\lambda$', xy=mu_star_t1_xy, align='left', ax=ax2, yoffset=1.5) - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("look_ahead_KG_5.pdf") - else: - plt.show() + finishing_touches_parallel(ax1=ax1, ax2=ax2, filename="kg_5.pdf") + return def main(init_size, initial_design): diff --git a/w06_hpo_bo/scripts/pi_plots.py b/w06_hpo_bo/scripts/pi_plots.py index d932f95..80549ed 100644 --- a/w06_hpo_bo/scripts/pi_plots.py +++ b/w06_hpo_bo/scripts/pi_plots.py @@ -2,10 +2,7 @@ warnings.filterwarnings('ignore') import argparse import logging -from functools import partial - -import numpy as np -from scipy.optimize import minimize +import os.path from sklearn.gaussian_process import GaussianProcessRegressor as GPR from sklearn.gaussian_process.kernels import Matern @@ -18,12 +15,15 @@ SEED = None TOGGLE_PRINT = False INIT_X_PRESENTATION = [2.5, 3.5, 5.5, 7, 9] +OUTPUT_DIR = os.path.abspath("./outputs/pi") bounds["x"] = (2, 13) bounds["gp_y"] = (-5, 5) -# boplot.set_rcparams(**{"legend.loc": "lower left"}) -labels["xlabel"] = "$\lambda'$" -labels["gp_ylabel"] = "$c(\lambda')$" +boplot.set_rc("savefig", directory=OUTPUT_DIR) + +labels["xlabel"] = "$\lambda$" +labels["gp_ylabel"] = "" + def initialize_dataset(initial_design, init=None): """ @@ -78,7 +78,6 @@ def visualize_pi(initial_design, init=None): # 4. Draw Vertical Normal at a good candidate for improvement # 5. 
Draw Vertical Normal at a bad candidate for improvement - # boplot.set_rcparams(**{'legend.loc': 'lower left'}) logging.debug("Visualizing PI with initial design {} and init {}".format(initial_design, init)) # Initialize dummy dataset @@ -98,179 +97,150 @@ def visualize_pi(initial_design, init=None): logging.debug("Model fit to dataset.\nOriginal Inputs: {0}\nOriginal Observations: {1}\n" "Predicted Means: {2}\nPredicted STDs: {3}".format(x, y, *(gp.predict(x, return_std=True)))) + def draw_basic_plot(mark_incumbent=True, show_objective=False): + fig, ax = plt.subplots(1, 1, squeeze=True) + ax.set_xlim(bounds["x"]) + ax.set_ylim(bounds["gp_y"]) + ax.grid() + boplot.plot_gp(model=gp, confidence_intervals=[1.0, 2.0, 3.0], custom_x=x, ax=ax) + if show_objective: + boplot.plot_objective_function(ax=ax) + boplot.mark_observations(X_=x, Y_=y, mark_incumbent=mark_incumbent, + highlight_datapoint=None, highlight_label=None, ax=ax) + + if mark_incumbent: + boplot.highlight_output( + y=np.array([ymin]), + label=['$c_{inc}$'], + lloc='left', + ax=ax, + disable_ticks=True + ) + boplot.annotate_y_edge( + label='$c_{inc}$', + xy=(x[ymin_arg], ymin), + ax=ax, + align='left', + yoffset=0.8 + ) + + return fig, ax + + + def finishing_touches(ax, show_legend=True, figname='pi.pdf'): + ax.set_xlabel(labels['xlabel']) + # ax.set_ylabel(labels['gp_ylabel']) + # ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') + + if show_legend: + ax.legend().set_zorder(zorders['legend']) + else: + ax.legend().remove() + + plt.tight_layout() + if TOGGLE_PRINT: + plt.savefig(f"{OUTPUT_DIR}/{figname}") + else: + plt.show() + # 1. Plot GP fit on initial dataset # -------------Plotting code ----------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, highlight_datapoint=None, highlight_label=None, ax=ax) - - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("pi_1.pdf") - else: - plt.show() - # ------------------------------------------- - # 2. Mark current incumbent - # -------------Plotting code ----------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("pi_2.pdf") - else: - plt.show() - # ------------------------------------------- + fig, ax = draw_basic_plot(mark_incumbent=False, show_objective=True) - # 3. 
Mark Zone of Probable Improvement - # -------------Plotting code ----------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - boplot.darken_graph(y=ymin, ax=ax) - - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') - - ax.legend().remove() - - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("pi_3.pdf") - else: - plt.show() - # ------------------------------------------- + finishing_touches(ax, show_legend=True, figname="pi_1.pdf") - # 4. Draw Vertical Normal at a good candidate for improvement + # ------------------------------------------- + # 2. Mark current incumbent # -------------Plotting code ----------------- - candidate = 5.0 - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - boplot.darken_graph(y=ymin, ax=ax) - - vcurve_x, vcurve_y, mu = boplot.draw_vertical_normal(gp=gp, incumbenty=ymin, ax=ax, xtest=candidate, xscale=2.0, yscale=1.0) - - ann_x = candidate + 0.5 * (np.max(vcurve_x) - candidate) / 2 - ann_y = mu - 0.25 - - arrow_x = ann_x - arrow_y = ann_y - 3.0 - - label = "{:.2f}".format(candidate) - - ax.annotate( - s=r'$PI^{(t)}(%s)$' % label, xy=(ann_x, ann_y), xytext=(arrow_x, arrow_y), - arrowprops={'arrowstyle': 'fancy'}, - weight='heavy', color='darkgreen', zorder=15 - ) - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') + fig, ax = draw_basic_plot(mark_incumbent=True, show_objective=True) - ax.legend().remove() + finishing_touches(ax, show_legend=True, figname="pi_2.pdf") - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("pi_4.pdf") - else: - plt.show() # ------------------------------------------- - # 5. 
Draw Vertical Normal at a bad candidate for improvement + def draw_final_graph(show_objective=False, show_vertical_normals=True, candidates=None, normal_labels=None): + if candidates is None and show_vertical_normals: + raise RuntimeError("In order to show vertical normal distributions, candidates at which the PDF is sampled " + "must be specified as a list of floats.") + + fig, ax = draw_basic_plot(show_objective=show_objective) + boplot.darken_graph(y=ymin, ax=ax) + + if show_vertical_normals: + if normal_labels is None: + normal_labels = [r"$P(\lambda_{%d})$" % (i + 1) for i in range(len(candidates))] + elif type(normal_labels) is str: + normal_labels = [normal_labels] * len(candidates) + + for idx in range(len(candidates)): + candidate = candidates[idx] + label = normal_labels[idx] + vcurve_x, vcurve_y, mu = boplot.draw_vertical_normal( + gp=gp, incumbenty=ymin, ax=ax, xtest=candidate, + xscale=2.0, yscale=1.0 + ) + + # ann_x = candidate + 0.5 * (np.max(vcurve_x) - candidate) / 2 + ann_x = candidate + # ann_y = ymin - 0.25 + ann_y = ymin - 3.0 + + arrow_x = candidate + 0.5 * (np.max(vcurve_x) - candidate) / 2 + arrow_x = ann_x + 0.1 + # arrow_y = ann_y - 3.0 + arrow_y = ymin - 3.0 + + # prob = "{:.2f}".format(candidate) + + ax.annotate( + s=label, xy=(ann_x, ann_y), xytext=(arrow_x, arrow_y), + # arrowprops={'arrowstyle': 'fancy', 'shrinkA': 20.0}, + # weight='heavy', color='darkgreen', zorder=zorders['annotations_high'] + ) + return fig, ax + + + # 3. Remove objective function. # -------------Plotting code ----------------- - fig, ax = plt.subplots(1, 1, squeeze=True) - ax.set_xlim(bounds["x"]) - ax.set_ylim(bounds["gp_y"]) - ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) - boplot.plot_objective_function(ax=ax) - boplot.mark_observations(X_=x, Y_=y, mark_incumbent=True, highlight_datapoint=None, highlight_label=None, ax=ax) - boplot.darken_graph(y=ymin, ax=ax) - - candidate = 5.0 - vcurve_x, vcurve_y, mu = boplot.draw_vertical_normal( - gp=gp, incumbenty=ymin, ax=ax, xtest=candidate, - xscale=2.0, yscale=1.0 - ) + fig, ax = draw_basic_plot(mark_incumbent=True, show_objective=False) - ann_x = candidate + 0.5 * (np.max(vcurve_x) - candidate) / 2 - ann_y = mu - 0.25 + finishing_touches(ax, show_legend=True, figname="pi_3.pdf") - arrow_x = ann_x - arrow_y = ann_y - 3.0 + # ------------------------------------------- - label = "{:.2f}".format(candidate) + # 4. Mark Zone of Probable Improvement (without legend and objective) + # -------------Plotting code ----------------- - ax.annotate( - s=r'$PI^{(t)}(%s)$' % label, xy=(ann_x, ann_y), xytext=(arrow_x, arrow_y), - arrowprops={'arrowstyle': 'fancy'}, - weight='heavy', color='darkgreen', zorder=15 - ) + fig, ax = draw_final_graph(show_objective=False, show_vertical_normals=False) - candidate = 8.0 - vcurve_x, vcurve_y, mu = boplot.draw_vertical_normal( - gp=gp, incumbenty=ymin, ax=ax, xtest=candidate, - xscale=2.0, yscale=1.0 - ) + finishing_touches(ax, show_legend=False, figname="pi_4.pdf") - ann_x = candidate + 0.5 * (np.max(vcurve_x) - candidate) / 2 - ann_y = mu - - arrow_x = ann_x - arrow_y = ann_y - 3.0 + # ------------------------------------------- - label = "{:.2f}".format(candidate) + # 5. 
Draw Vertical Normal at a good candidate for improvement + # -------------Plotting code ----------------- + fig, ax = draw_final_graph( + show_vertical_normals=True, + candidates=[5.0], + normal_labels=r'$PI^{(t)} \approx 0.5$' + ) - ax.annotate(s=r'$PI^{(t)}(%s)$' % label, xy=(ann_x, ann_y), xytext=(arrow_x, arrow_y), - arrowprops={'arrowstyle': 'fancy'}, - weight='heavy', color='darkgreen', zorder=15) + finishing_touches(ax, show_legend=False, figname="pi_5.pdf") + # ------------------------------------------- - ax.legend().set_zorder(20) - ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') + # 6. Draw Vertical Normal at a bad candidate for improvement + # -------------Plotting code ----------------- - ax.legend().remove() + fig, ax = draw_final_graph( + show_vertical_normals=True, + candidates=[5.0, 8.0], + normal_labels=[r'$PI^{(t)} \approx 0.5$', r'$PI^{(t)} \approx 0.0$'] + ) - plt.tight_layout() - if TOGGLE_PRINT: - plt.savefig("pi_5.pdf") - else: - plt.show() + finishing_touches(ax, show_legend=False, figname="pi_6.pdf") # ------------------------------------------- diff --git a/w06_hpo_bo/scripts/ts_plots.py b/w06_hpo_bo/scripts/ts_plots.py index 78e5680..417ace8 100644 --- a/w06_hpo_bo/scripts/ts_plots.py +++ b/w06_hpo_bo/scripts/ts_plots.py @@ -3,7 +3,7 @@ import argparse import logging from functools import partial - +import os.path import numpy as np from scipy.optimize import minimize from sklearn.gaussian_process import GaussianProcessRegressor as GPR @@ -14,15 +14,15 @@ import bo_plot_utils as boplot from bo_configurations import * - SEED = None TOGGLE_PRINT = False INIT_X_PRESENTATION = [2.5, 3.5, 5.5, 7, 9] +OUTPUT_DIR = os.path.abspath("./outputs/ts") bounds["x"] = (2, 13) bounds["gp_y"] = (-5, 5) -labels["xlabel"] = "$\lambda'$" -labels["gp_ylabel"] = "$c(\lambda')$" +labels["xlabel"] = "$\lambda$" +labels["gp_ylabel"] = "" def initialize_dataset(initial_design, init=None): """ @@ -93,24 +93,24 @@ def visualize_ts(initial_design, init=None): logging.debug("Model fit to dataset.\nOriginal Inputs: {0}\nOriginal Observations: {1}\n" "Predicted Means: {2}\nPredicted STDs: {3}".format(x, y, *(gp.predict(x, return_std=True)))) + + # 1. 
Plot GP fit on initial dataset # -------------Plotting code ----------------- fig, ax = plt.subplots(1, 1, squeeze=True) ax.set_xlim(bounds["x"]) ax.set_ylim(bounds["gp_y"]) ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) + boplot.plot_gp(model=gp, confidence_intervals=[1.0, 2.0, 3.0], custom_x=x, ax=ax) boplot.plot_objective_function(ax=ax) boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, highlight_datapoint=None, highlight_label=None, ax=ax) - ax.legend().set_zorder(20) + ax.legend().set_zorder(zorders['legend']) ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') plt.tight_layout() if TOGGLE_PRINT: - plt.savefig("ts_1.pdf") + plt.savefig(f"{OUTPUT_DIR}/ts_1.pdf") else: plt.show() # ------------------------------------------- @@ -121,14 +121,14 @@ def visualize_ts(initial_design, init=None): ax.set_xlim(bounds["x"]) ax.set_ylim(bounds["gp_y"]) ax.grid() - boplot.plot_gp(model=gp, confidence_intervals=[2.0], custom_x=x, ax=ax) + boplot.plot_gp(model=gp, confidence_intervals=[1.0, 2.0, 3.0], custom_x=x, ax=ax) boplot.plot_objective_function(ax=ax) boplot.mark_observations(X_=x, Y_=y, mark_incumbent=False, highlight_datapoint=None, highlight_label=None, ax=ax) # Sample from the GP nsamples = 1 - seed2 = 1256 - seed3 = 65 + seed2 = 2 + seed3 = 1375 X_ = boplot.get_plot_domain(precision=None) mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=seed3) boplot.plot_gp_samples( @@ -141,14 +141,12 @@ def visualize_ts(initial_design, init=None): seed=seed2 ) - ax.legend().set_zorder(20) + ax.legend().set_zorder(zorders['legend']) ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') plt.tight_layout() if TOGGLE_PRINT: - plt.savefig("ts_2.pdf") + plt.savefig(f"{OUTPUT_DIR}/ts_2.pdf") else: plt.show() # ------------------------------------------- @@ -167,6 +165,7 @@ def visualize_ts(initial_design, init=None): nsamples = 1 X_ = boplot.get_plot_domain(precision=None) mu = gp.sample_y(X=X_, n_samples=nsamples, random_state=seed3) + colors['highlighted_observations'] = 'red' # Special for Thompson Sampling boplot.plot_gp_samples( mu=mu, nsamples=nsamples, @@ -182,18 +181,16 @@ def visualize_ts(initial_design, init=None): cost = mu[min_idx] boplot.highlight_configuration(x=np.array([candidate]), label='', lloc='bottom', disable_ticks=True, ax=ax) - boplot.annotate_x_edge(label=r'$\lambda^{(t)}$',xy=(candidate, cost), align='bottom', ax=ax) + boplot.annotate_x_edge(label=r'$\lambda^{(t)}$',xy=(candidate, cost), align='top', ax=ax) boplot.highlight_output(y=np.array([cost]), label='', lloc='left', disable_ticks=True, ax=ax) - boplot.annotate_y_edge(label=r'${\hat{c}}^{(t)}(\lambda^{(t)})$', xy=(candidate, cost), align='left', ax=ax) + boplot.annotate_y_edge(label=r'$g(\lambda^{(t)})$', xy=(candidate, cost), align='left', ax=ax) - ax.legend().set_zorder(20) + ax.legend().set_zorder(zorders['legend']) ax.set_xlabel(labels['xlabel']) - ax.set_ylabel(labels['gp_ylabel']) - ax.set_title(r"Visualization of $\mathcal{G}^{(t)}$", loc='left') plt.tight_layout() if TOGGLE_PRINT: - plt.savefig("ts_3.pdf") + plt.savefig(f"{OUTPUT_DIR}/ts_3.pdf") else: plt.show() # ------------------------------------------- diff --git a/w06_hpo_bo/t00_main.tex b/w06_hpo_bo/t00_main.tex index d7828f9..9087f29 100644 --- a/w06_hpo_bo/t00_main.tex +++ b/w06_hpo_bo/t00_main.tex @@ -1,26 +1,35 @@ -%TODO change PATH for the 
github structure -% \input{../latex_main/main} -\input{../latex_main/main.tex} +% TODO: change the PATH +%\input{../latex_main/main.tex} +\input{latex_main/main.tex} +% FH: I created this command videotitle (and file title_slide.tex) to show a new title slide for each video without the need of any repeating boiler plate code. Please do not change this anymore. +\newcommand{\videotitle}[1]{\subtitle{#1}\section{#1}\input{title_slide}} % Bayesian Optimization SS 2020 macros -%\input{../latex_main/macros_bo.tex} +% \input{latex_main/macros_bo.tex} +% there is no need to include macros_bo.tex unless new macros are added to it. This file is just a temporary holder of new macros until they are pushed upstream into the main macros.tex file on git, and will never be pushed to git. -% allowing to align images inside a figure -\usepackage[export]{adjustbox} +%\usepackage[export]{adjustbox} % setting notes visibility -\usepackage{pgfpages} -\setbeameroption{hide notes} % Only slides -%\setbeameroption{show only notes} % Only notes -%\setbeameroption{show notes on second screen=right} % Both - -%TODO: change titles -\title[AutoML: Bayesian Optimization for HPO]{AutoML: Bayesian Optimization for HPO} % week title -\subtitle{Overview of the Bayesian optimization loop} % video title -%TODO: change authors! +%\usepackage{pgfpages} +%\setbeameroption{hide notes} % Only slides + + +% \AtBeginSection[] % Do nothing for \section* +% { +% \begin{frame}{Outline} +% \bigskip +% \vfill +% \tableofcontents[currentsection] +% \end{frame} +% } + + +\title[AutoML: Bayesian Optimization for HPO]{AutoML: Bayesian Optimization for Hyperparameter Optimization} % week title \author[Marius Lindauer]{Bernd Bischl \and \underline{Frank Hutter} \and Lars Kotthoff\newline \and Marius Lindauer \and Joaquin Vanschoren} \institute{} \date{} + \AtBeginSection[] % Do nothing for \section* { \begin{frame}{Outline} @@ -28,28 +37,25 @@ \vfill \tableofcontents[currentsection] \end{frame} - \begin{frame} - \vfill - \centering - \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title} - \usebeamerfont{title}\insertsectionhead\par% - \end{beamercolorbox} - \vfill - \end{frame} } + \begin{document} - - \maketitle - - %TODO: change PATH for the github structure + + \begin{frame}{Outline} + \bigskip + \vfill + \tableofcontents + \end{frame} + \input{t01_introduction.tex} - \input{t03_acq_functions_1.tex} + \input{t02_acq_functions_1.tex} \input{t03_acq_functions_2.tex} \input{t04_surrogate_models.tex} - \input{t05_extensions.tex} - \input{t06_tpe.tex} - \input{t07_success_stories.tex} + \input{t05_high_dim_BO.tex} + \input{t06_extensions.tex} + \input{t07_tpe.tex} + \input{t08_success_stories.tex} \input{t99_bibliography} \end{document} diff --git a/w06_hpo_bo/t01_introduction.pdf b/w06_hpo_bo/t01_introduction.pdf new file mode 100644 index 0000000..a550650 Binary files /dev/null and b/w06_hpo_bo/t01_introduction.pdf differ diff --git a/w06_hpo_bo/t01_introduction.tex b/w06_hpo_bo/t01_introduction.tex index 5f6353b..9b93992 100644 --- a/w06_hpo_bo/t01_introduction.tex +++ b/w06_hpo_bo/t01_introduction.tex @@ -1,36 +1,35 @@ -\section{Introduction to Bayesian Optimization} -%---------------------------------------------------------------------- -\begin{frame}[c]{Introduction to Bayesian Optimization} -\framesubtitle{Global Optimization} - -Consider a \emph{well behaved} function $\func$ : $\pcs \rightarrow \realnum$ where $\pcs \subseteq \realnum^D$ is a bounded domain. 
Our goal is to find -% -\begin{equation*} - \optconf=\argmin_{\conf\in\pcs} \func(\conf). -\end{equation*} -\vspace{-0.6cm} +\videotitle{Introduction to Bayesian Optimization} + +%----------------------------------------------------------------------- +\myframetop{Blackbox Optimization for Hyperparameter Optimization}{ + + \myit{ + \item Consider the \alert{global optimization problem} of finding: + \[\conf^* \in \argmin_{\conf \in \confs} f(\conf)\] + + \item In the most general form, function $f$ is a \alert{blackbox function}: + \begin{center} +\scalebox{0.5}{\input{images/intro_images/blackbox_HPO.tex}} + \end{center} + \myit{ + \item Only mode of interaction with $f$: querying $f$'s value at a given $\conf$ + \item Function $f$ may not be available in closed form, not differentiable, noisy, etc. + } +\medskip +\pause + + \item Today, we'll discuss a \alert{Bayesian} approach for solving such blackbox optimization problems +\medskip +\pause + \item Blackbox optimization can be used for hyperparameter optimization (HPO) + \myit{ + \item Define \alert{$f(\conf) := \mathcal{L}( \mathcal{A}_{\conf}, \mathcal{D}_{train}, \mathcal{D}_{valid} )$} + \pause + \item Note: for formulations of HPO that go beyond blackbox optimization, see next lecture + } + } +} -\begin{columns}[T] -\column{0.4\textwidth} -\begin{itemize} - \item Function $\func$ is explicitly unknown - this is called a black box function - \item and can be multimodal - \item Only mode of interaction: Query $\conf$ to obtain a potentially noisy observation $\cost(\conf) = \func(\conf) + \epsilon$ - \item Evaluations are expensive - \item There is no gradient information available -\end{itemize} -$\rightarrow$ For the remainder of this lecture we will \emph{minimize} $\cost$ -% -\column{0.6\textwidth} -\begin{figure} - \begin{multicols}{2} - \includegraphics[width=0.4\textwidth, right]{images/intro_images/branin.png} - \includegraphics[width=0.4\textwidth,left]{images/intro_images/branin_countour.png} - \end{multicols} -\end{figure} -\source{\href{https://uqworld.org/t/branin-function/53}{Branin}} -\end{columns} -\end{frame} %---------------------------------------------------------------------- %\begin{frame}[c]{Optimization problem example} @@ -149,40 +148,70 @@ \section{Introduction to Bayesian Optimization} %\end{frame} %---------------------------------------------------------------------- -\begin{frame}[c]{Introduction to Bayesian Optimization} -\framesubtitle{In a nutshell} +\myframetop{Bayesian Optimization of a blackbox function in a nutshell}{ + +\bigskip +\bigskip +\bigskip + + \onslide<1-> + \begin{figure} + \vspace{-1em} + \centering + \only<1>{ + \includegraphics[width=0.95\textwidth]{w06_hpo_bo/images/intro_images/IntroPlots_Obs.pdf} + }\only<2>{ + \includegraphics[width=0.95\textwidth]{w06_hpo_bo/images/intro_images/IntroPlots_GP.pdf} + }\only<3>{ + \includegraphics[width=0.95\textwidth]{w06_hpo_bo/images/intro_images/IntroPlots_Acqui.pdf} + }\only<4->{ + \includegraphics[width=0.95\textwidth]{w06_hpo_bo/images/intro_images/IntroPlots_Complete.pdf} + } + \end{figure} + +% \vspace*{-0.5cm}\notefh{Can you please increase the plotted values of the acquisition function, so that it is more clearly visible? Also on the next slide. You could, e.g., normalize it to have a certain maximum (a bit larger than that of the third plot on the next slide). 
Also, on the next slide, can you please plot the new observation red, not green?} +} + +%---------------------------------------------------------------------- +\begin{frame}[c]{Bayesian Optimization of a blackbox function in a nutshell} \begin{columns}[T] \column{0.45\textwidth} General approach \begin{itemize} - \item <2-> Fit a \emph{probabilistic model} to the collected function samples $\langle{}\conf, \cost(\conf)\rangle{}$ - \item <3-> Use the model to guide optimization, trading off \emph{exploration \vs{} exploitation} + \item Fit a \alert{probabilistic model} to the collected function samples $\langle{}\conf, \cost(\conf)\rangle{}$ + \item Use the model to guide optimization, trading off \alert{exploration \vs{} exploitation} % \item Acquisition function for exploration-exploitation tradeoff % \item Optimize on acquisition function\\ to get next $x$ $\conf$ ($x$) \end{itemize} -\onslide<11-> Popular approach in the statistics literature since \lit{\href{http://link.springer.com/chapter/10.1007\%2F3-540-07165-2_55}{Mockus et al. 1978}} -\begin{itemize} - \item <12-> Efficient in \emph{\#function evaluations} - \item <13-> Works when objective is \emph{nonconvex, noisy, has unknown derivatives, etc.} - \item <14-> Recent \emph{convergence} results\\ \lit{\href{https://arxiv.org/abs/0912.3995}{Srinivas et al. 2009}; \href{http://www.jmlr.org/papers/v12/bull11a.html}{Bull et al. 2011}; \href{https://www.cs.ubc.ca/~nando/papers/BayesBandits.pdf}{de Freitas et al. 2012}; \href{http://papers.nips.cc/paper/5715-bayesian-optimization-with-exponential-convergence}{Kawaguchi et al. 2015}} -% \item Popular \alert{Bayesian optimization workshop} at NIPS (the premiere machine learning conference) -\end{itemize} +\bigskip + +\onslide<4->{ + \alert{Popular approach in the statistics literature} since + \href{http://link.springer.com/chapter/10.1007\%2F3-540-07165-2_55}{\footnotesize\color{black!70} Mockus et al. [1978]} + \begin{itemize} + \item Efficient in \#function evaluations + \item Works when objective is \alert{nonconvex, noisy, has unknown derivatives, etc.} + \item Recent \alert{convergence} results\\ \lit{\href{https://arxiv.org/abs/0912.3995}{Srinivas et al. 2009}; \href{http://www.jmlr.org/papers/v12/bull11a.html}{Bull et al. 2011}; \href{https://www.cs.ubc.ca/~nando/papers/BayesBandits.pdf}{de Freitas et al. 2012}; \href{http://papers.nips.cc/paper/5715-bayesian-optimization-with-exponential-convergence}{Kawaguchi et al. 
2015}} + % \item Popular \alert{Bayesian optimization workshop} at NIPS (the premiere machine learning conference) + \end{itemize} +} \column{0.55\textwidth} \onslide<1-> \begin{figure} + \vspace{-1em} \centering - \only<1>{\includegraphics[width=\textwidth]{images/intro_images/BOLoop_Initial_Points.pdf}} - \only<2>{\includegraphics[width=\textwidth]{images/intro_images/BOLoop_Initial_Points_and_GP.pdf}} - \only<3>{\includegraphics[width=\textwidth]{images/intro_images/plot_1.pdf}} - \only<4>{\includegraphics[width=\textwidth]{images/intro_images/plot_2.pdf}} - \only<5>{\includegraphics[width=\textwidth]{images/intro_images/plot_3.pdf}} - \only<6>{\includegraphics[width=\textwidth]{images/intro_images/plot_4.pdf}} - \only<7>{\includegraphics[width=\textwidth]{images/intro_images/plot_5.pdf}} - \only<8>{\includegraphics[width=\textwidth]{images/intro_images/plot_6.pdf}} - \only<9->{\includegraphics[width=\textwidth]{images/intro_images/plot_7.pdf}} + \onslide<1->{\includegraphics[width=0.8\textwidth]{w06_hpo_bo/images/intro_images/IntroPlots_Iter2.pdf}} + \onslide<2->{\includegraphics[width=0.8\textwidth]{w06_hpo_bo/images/intro_images/IntroPlots_Iter3.pdf}} + \onslide<3->{\includegraphics[width=0.8\textwidth]{w06_hpo_bo/images/intro_images/IntroPlots_Iter4.pdf}} + %\only<4>{\includegraphics[width=\textwidth]{images/intro_images/plot_2.pdf}} + %\only<5>{\includegraphics[width=\textwidth]{images/intro_images/plot_3.pdf}} + %\only<6>{\includegraphics[width=\textwidth]{images/intro_images/plot_4.pdf}} + %\only<7>{\includegraphics[width=\textwidth]{images/intro_images/plot_5.pdf}} + %\only<8>{\includegraphics[width=\textwidth]{images/intro_images/plot_6.pdf}} + %\only<9->{\includegraphics[width=\textwidth]{images/intro_images/plot_7.pdf}} \end{figure} \end{columns} @@ -256,14 +285,13 @@ \section{Introduction to Bayesian Optimization} %\end{frame} %---------------------------------------------------------------------- -\begin{frame}[c]{Introduction to Bayesian Optimization} -\framesubtitle{Pseudocode} +\begin{frame}[c]{Bayesian Optimization: Pseudocode} \begin{center} \begin{minipage}{0.75\textwidth} \begin{algorithm}[H] %\DontPrintSemicolon - \SetAlgoLined +% \SetAlgoLined \setcounter{AlgoLine}{0} \SetKwInOut{Require}{Require} \SetKwInOut{Result}{Result} @@ -272,53 +300,53 @@ \section{Introduction to Bayesian Optimization} cost function $\cost$, acquisition function $\acq$, predictive model $\surro$, maximal number of function evaluations $\bobudget$} - \Result{Best observed configuration $\finconf$ according to $\iter[\bobudget]{\dataset}$ or $\surro$} + \Result{Best configuration $\finconf$ + (according to $\dataset$ or + $\surro$)} - $\iter[0]{\dataset} \leftarrow \varnothing$\; + Initialize data $\iter[0]{\dataset}$ with initial observations\;% \leftarrow \varnothing$\; \For{$\bocount=1$ \KwTo $\bobudget$}{ %\While{$B$ not exhausted} { - $\iter[\bocount]{\surro}$ $\leftarrow$ fit predictive model on $\iter[\bocount-1]{\dataset}$\; + Fit predictive model $\iter[\bocount]{\surro}$ on $\iter[\bocount-1]{\dataset}$\; - $\bonextsample \leftarrow \bonextsample \in \argmax_{\conf \in \pcs} \acq(\conf; \iter[\bocount-1]{\dataset}, \iter[\bocount]{\surro})$\; + Select next query point: $\bonextsample \in \argmax_{\conf \in \pcs} \acq(\conf; \iter[\bocount-1]{\dataset}, \iter[\bocount]{\surro})$\; Query $\bonextobs$\; - $\iter[\bocount]{\dataset} \leftarrow \iter[\bocount-1]{\dataset} \cup \{\langle \bonextsample, \bonextobs \rangle \}$\; + Update data: $\iter[\bocount]{\dataset} \leftarrow 
\iter[\bocount-1]{\dataset} \cup \{\langle \bonextsample, \bonextobs \rangle \}$\; } - \caption{BO loop} + \caption*{BO loop} \end{algorithm} \end{minipage} \end{center} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Introduction to Bayesian Optimization} -\framesubtitle{Where does the name come from?} +\begin{frame}[c]{Bayesian Optimization: Origin of the Name} \begin{itemize} - \item <+-> Bayesian optimization uses Bayes' theorem: + \item Bayesian optimization uses \alert{Bayes' theorem}: \begin{equation*} - P(A \vert B) = \frac{P(B \wedge A) \times P(A)}{P(B)} + P(A \vert B) = \frac{P(B \vert A) \times P(A)}{P(B)} \propto P(B \vert A) \times P(A) \end{equation*} - \item <+-> We refer to: - \begin{itemize} - \item $A$ as a model (or hypothesis, theory), - \item $B$ as a data (or observations, evidence), - \item $P(A \vert B)$ as a \emph{posterior} probability of a model given a data, - \item $P(B \vert A)$ as a \emph{likelihood} of a data given a model, - \item $P(A)$ as a \emph{prior} probability of a model, which represents our belief about the space of possible objective functions. - \end{itemize} - \item <+-> In our application: + \item Bayesian optimization uses this to compute a posterior over functions: \begin{equation*} - P(\func \vert \dataset_{1:\bocount}) \propto P(\dataset_{1:\bocount} \vert \func) \times P(\func) + P(\func \vert \dataset_{1:\bocount}) \propto P(\dataset_{1:\bocount} \vert \func) \times P(\func), \text{~~~~ where } \dataset_{1:\bocount} = \left \{ \conf_{1:\bocount}, \cost(\conf_{1:\bocount}) \right\} \end{equation*} - where $\dataset_{1:\bocount} = \left \{ \conf_{1:\bocount}, \cost(\conf_{1:\bocount}) \right \}$. -\end{itemize} +\pause +\vspace*{-0.5cm} + \item Meaning of the individual terms: + \begin{itemize} + \item $P(f)$ is the \alert{prior} over functions, which represents our belief about the space of possible objective functions \alert{before} we see any data + \item $\dataset_{1:\bocount}$ is the \alert{data} (or observations, evidence) + \item $P(\dataset_{1:\bocount} \vert \func)$ is the likelihood of the data given a function + \item $P(\func \vert \dataset_{1:\bocount})$ is the \alert{posterior} probability over functions given the data + \end{itemize} + \end{itemize} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Introduction to Bayesian Optimization} -\framesubtitle{Advantages and Disadvantages} +\begin{frame}[c]{Bayesian Optimization: Advantages and Disadvantages} \begin{columns}[T] % align columns \begin{column}{.48\textwidth} @@ -343,7 +371,7 @@ \section{Introduction to Bayesian Optimization} \begin{block}{Disadvantages} \begin{itemize} \item Overhead because of model training in each iteration - \item Open design choices: surrogate model, acquisition function + \item Crucially relies on robust surrogate model \item Inherently sequential (in its basic form) \end{itemize} \end{block} @@ -354,13 +382,17 @@ \section{Introduction to Bayesian Optimization} \end{frame} %----------------------------------------------------------------------- \begin{frame}[c]{Learning Goals of this Lecture} +\framesubtitle{After this lecture, students can ...} -\begin{enumerate} - \item Learn how Bayesian optimization works and can be used for HPO. - \item Learn about the two main ingredients of Bayesian optimization: Acquisition Functions and Surrogate Models. - \item Learn the limits of Bayesian optimization and the extensions which tackle these. 
- \item Know success stories of Bayesian optimization. -\end{enumerate} +\begin{itemize} + \item Explain the basics of Bayesian optimization + \item Derive \alert{simple acquisition functions} + \item Describe \alert{advanced acquisition functions} + \item Describe possible \alert{surrogate models} and their pros and cons + \item Discuss the \alert{limits of Bayesian optimization} and extensions to tackle these +% \item Describe the \alert{alternative Bayesian optimization approach of TPE} + \item Discuss \alert{success stories} of Bayesian optimization +\end{itemize} \end{frame} @@ -382,3 +414,5 @@ \section{Introduction to Bayesian Optimization} %\end{frame} %----------------------------------------------------------------------- + +%\end{document} \ No newline at end of file diff --git a/w06_hpo_bo/t02_acq_functions_1.pdf b/w06_hpo_bo/t02_acq_functions_1.pdf new file mode 100644 index 0000000..02a642d Binary files /dev/null and b/w06_hpo_bo/t02_acq_functions_1.pdf differ diff --git a/w06_hpo_bo/t02_acq_functions_1.tex b/w06_hpo_bo/t02_acq_functions_1.tex new file mode 100644 index 0000000..c79a6fe --- /dev/null +++ b/w06_hpo_bo/t02_acq_functions_1.tex @@ -0,0 +1,329 @@ +\videotitle{Computationally Cheap Acquisition Functions} + +%---------------------------------------------------------------------- +\begin{frame}[c]{Acquisition Functions: the Basics} +\begin{itemize} + \item Given the surrogate model $\iter{\surro}$ at the $\bocount$-th iteration of BO, the \\ + \alert{acquisition function $\acq(\cdot)$ judges the utility (or usefulness) of evaluating $f$ at $\iter{\conf}\in \pcs$ next} + \pause + \bigskip + \item The acquisition function needs to \alert{trade off exploration and exploitation} + \myit{ + \item E.g., just picking the $\conf$ with lowest predicted mean would be too greedy + \item We also need to take into account the uncertainty of the surrogate model $\iter{\surro}$ to explore + } +\end{itemize} + +\end{frame} +%----------------------------------------------------------------------- +\myframetop{Probability of Improvement (PI): Concept}{ + %\framesubtitle{Probability of Improvement - Concept} + % \begin{figure} + \centering + \begin{tikzpicture} + \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_1.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Given the surrogate fit at iteration $\bocount$}; + + \node<+> (img2) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_2.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img2, align=center]{Current incumbent $\hat{\conf}$ and its observed cost $\cost_{inc}$}; + + + \node<+> (img3) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_3.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img3, align=center]{Now let's drop the objective function - it's unknown after all!}; + + + \node<+> (img4) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_4.pdf}}; + \node<.> [below=-1.0\belowcaptionskip of img4, align=center]{Intuitively, we care about the probability of improving over the current incumbent}; + \comment{We cannot be absolutely certain if there will be an improvement, but we are certain that if there is to be improvement, it is only possible in this zone.} + + \node<+> (img5) {\includegraphics[width=\linewidth, height=0.7\textheight, 
keepaspectratio=true]{images/acq_func_images/pi/pi_5.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img5, align=center]{PDF of a good candidate configuration. Only the green area is an improvement.}; + + \node<+> (img6) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_6.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img6, align=center]{PDF of a bad candidate configuration}; + \end{tikzpicture} + % \end{figure} +} +% ----------------------------------------------------------------------- +\begin{frame}[c]{Probability of Improvement (PI): Formal Definition} +%\framesubtitle{Probability of Improvement - Choosing a candidate} +%\comment{The definitions were adapted from the source to fit an acquisition function that is maximized and an objective function which is to be minimized.!} +\begin{itemize} + \item We define the \alert{current incumbent at time step $t$} as: + $\incumbent[\bocount-1]\in\argmin_{\conf'\in\iter[\bocount-1]{\dataset}}\obs[\conf']$ + \item We write \alert{$\cost_{inc}$} shorthand for the \alert{cost of the current incumbent}: + $c_{inc} = \cost(\incumbent[\bocount-1])$ +\smallskip + \item The \alert{probability of improvement $\acq_{PI}(\conf)$} at a configuration $\conf$ is then defined as: + \alert{\[\iter{\acq}_{PI}(\conf) = P(\cost(\conf) \leq \cost_{inc}).\]} + \vspace*{-0.5cm} + \pause + \item Since the predictive distribution for $\cost(\conf)$ is a Gaussian $\normaldist(\iter[\bocount-1]{\mean}(\conf), \iter[\bocount-1]{\variance}(\conf))$, this can be written as: + \[ + \alert{\iter{\acq}_{PI}(\conf) = \cdf[Z]}, \quad \text{with } Z = \dfrac{\cost_{inc} - \iter[\bocount-1]{\mean}(\conf) - \xi}{\iter[\bocount-1]{\stddev}(\conf)}, + \] + \newline + where $\cdf(\cdot)$ is the CDF of the standard normal distribution and $\xi$ is an optional exploration parameter + \pause + \item[] \[\boxed{\text{Choose}\;\;\bonextsample \in \argmax_{\conf\in\pcs}(\iter{\acq}_{PI}(\conf))}\] +% \comment{Source: Tutorial by Brochu et al.: https://arxiv.org/pdf/1012.2599.pdf } +\end{itemize} +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[t]{Expected Improvement (EI): Concept} +%\framesubtitle{Expected Improvement - Concept} + +% \begin{figure} + \centering + \begin{tikzpicture} + \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_1.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Given the surrogate fit at iteration $\bocount$}; + + \node<+> (img2a) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_2a.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img2a, align=center]{Region of probable improvement -- but \alert{how large} is the improvement?}; + + \node<+> (img2b) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_2b.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img2b, align=center]{Region of probable improvement -- but \alert{how large} is the improvement?}; + + \node<+> (img3) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_3.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img3, align=center]{Hypothetical \emph{real} cost $c$ at a given $\conf$ - unknown in practice without evaluating}; + + \node<+> (img4) {\includegraphics[width=\linewidth, height=0.7\textheight, 
keepaspectratio=true]{images/acq_func_images/ei/ei_4.pdf}}; + \node<.> [below=-0.01\belowcaptionskip of img4, align=center]{Given a hypothetical $c$, we can compute the improvement $I_c(\conf)$}; +% Without performing an actual evaluation, we cannot calculate $\iter{I}(\conf)$}; + + \node<+> (img5) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_5.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img5, align=center]{Given $\surro(\conf) = \normaldist( \mean(\conf), \variance(\conf))$, we can also compute $p(\cost|\conf)$.}; + + \node<+> (img6) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_6.pdf}}; + \node<.> [below=-0.01\belowcaptionskip of img6, align=center]{Compare the likelihood of a given improvement for two different configurations $\conf_1$ and $\conf_2$}; + + \node<+> (img7) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_7.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img7, align=center]{Now consider the likelihood of a larger improvement.}; + + \node<+> (img8) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_8.pdf}}; + \node<.> [below=-0.01\belowcaptionskip of img8, align=center]{Larger improvements are more likely in areas of high uncertainty.\\ To compute $\E[I(\conf)]$, intuitively, we sum $p(\cost \mid \conf) \times I_\cost$ over all possible values of $\cost$. + %\\We can thus use $\surro(\conf) = \normaldist( \mean(\conf), \variance(\conf))$ to calculate $\E[\iter{I}(\conf)]$. + }; + + \end{tikzpicture} +% \end{figure} + +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Expected Improvement (EI): Formal Definition} +%\framesubtitle{Expected Improvement - Choosing a candidate} + \begin{itemize}\abovedisplayskip=0em\belowdisplayskip=-0.75em + \item We define the one-step positive \alert{improvement over the current incumbent} as + \smallskip + \[ + \alert{\iter{I}(\conf) = \max(0, \cost_{inc} - \cost(\conf))} + \] +% \comment{This is probably a great time to point out, once again, that because I is defined in terms of the actual cost function, we cannot directly compute it.} + \smallskip + \item Expected Improvement is then defined as \alert{\[\iter{\acq}_{EI}(\conf) = \E[\iter{I}(\conf)] = \int_{-\infty}^{\infty} \iter{p}(\cost \mid \conf) \times \iter[\bocount]{I}(\conf)\;\; d\cost.\]} + \pause + \smallskip + \item Since the posterior distribution of $\surro(\conf)$ is a Gaussian, EI can be computed in closed form (see exercise): + +% \comment{Maybe emphasize that this is actually how and where the dependence on the actual cost function is replaced with a dependence on the surrogate.} + \begin{align*} + \alert{\iter{\acq}_{EI}(\conf)} &\alert{=} + \begin{cases} + \alert{\iter{\stddev}(\conf)[Z\cdf(Z) + \pdf(Z)]}, & \text{if }\iter{\stddev}(\conf) > 0 \\ + 0 & \text{if }\iter{\stddev}(\conf) = 0, + \end{cases}\\ + \text{where }Z &=\dfrac{\cost_{inc} - \iter{\mean}(\conf) - \xi}{\iter{\stddev}(\conf)} + \text{ and } \xi \text{ is an optional exploration parameter.} + \end{align*} +% \comment{I believe I needed to switch the signs of $\cost(\cdot)$ and $\mean(\cdot)$ as compared to the reference paper in order to accommodate for our convention of minimization/maximization.
Please cross-check!} + \pause + \bigskip + \[\boxed{\text{Choose}\;\;\bonextsample \in \argmax_{\conf\in\pcs}(\iter{\acq}_{EI}(\conf))} + \] + \end{itemize} +\end{frame} +%----------------------------------------------------------------------- +%\begin{frame}[c]{Computationally Cheap Acquisition Functions - EI} +%\framesubtitle{Expected Improvement - Choosing a candidate} +%\comment{Verify if formulae agree with minimizing the surrogate.} +% \begin{itemize}\abovedisplayskip=0pt\belowdisplayskip=-0.5em +% \item[] We first define one-step improvement over the current incumbent, as +% \smallskip +% \[ +% \iter{I}(\conf) = \max(0, \cost(\incumbent[\bocount-1]) - \cost(\conf)), \quad\incumbent[\bocount-1]\in\argmin_{\conf'\in\iter[\bocount-1]{\dataset}}\obs[\conf']\in\iter[\bocount-1]{\dataset} +% \] +% \comment{This is probably a great time to point out, once again, that because I is defined in terms of the actual cost function, we cannot directly compute it.} +% \pause +% \medskip +% \item[] Expected Improvement is then defined as +% \begin{align*} +% \iter{\acq}_{EI}(\conf) &= \E[\iter{I}(\conf)]\\ +% &= \int_{\iter{I}=0}^{\iter{I}=\infty}\iter{I} P(\iter{I})d\iter{I} +% \end{align*} +% \pause +% \medskip +% \item[]Since the posterior distribution of the surrogate is a Gaussian, it can be shown that the distribution on $\iter{I}(\conf)$ is also a Gaussian, defined as +% \[ +% P(\iter{I}) = +% \dfrac{1}{\sqrt{2\pi}\iter{\stddev}(\conf)}\exp{\left[-\dfrac{{(\cost(\incumbent[\bocount-1])-\iter{\mean}(\conf)-\iter{I})}^2}{2\iter{\left(\variance\right)}(\conf)} +% \right]} +% \] +% \comment{Maybe emphasize that this is actually how and where the dependence on the actual cost function is replaced with a dependence on the surrogate.} +% \end{itemize} +%\end{frame} +%----------------------------------------------------------------------- +% \begin{frame}[c]{Computationally Cheap Acquisition Functions - EI} +% \framesubtitle{Expected Improvement - Choosing a candidate} +% \begin{align*} +% \action<+->{\iter{\acq}_{EI}(\conf) &= \int_{\iter{I}=0}^{\iter{I}=\infty}\iter{I} \dfrac{1}{\sqrt{2\pi}\iter{\stddev}(\conf)}\exp{-\dfrac{{(\cost(\incumbent[\bocount-1])-\iter{\mean}(\conf)-\iter{I})}^2}{2\iter{\left(\variance\right)}(\conf)}}d\iter{I}\\} +% \action<+->{&= +% \begin{cases} +% (\cost(\incumbent) - \iter{\mean}(\conf) - \xi)\cdf(Z) + \iter{\stddev}(\conf) \pdf(Z), & \text{if }\iter{\stddev}(\conf) > 0 \\ +% 0 & \text{if }\iter{\stddev}(\conf) = 0 +% \end{cases}\\} +% \action<+->{\intertext{where }Z} \action<.->{&=\dfrac{\cost(\incumbent) - \iter{\mean}(\conf) - \xi}{\iter{\stddev}(\conf)}} +% \action<+->{\Aboxed{\bonextsample \in \argmax_{\conf\in\pcs}(\iter{\acq}_{EI}(\conf))}} +% \end{align*} +% % \comment{Source: Tutorial by Brochu et al.: https://arxiv.org/pdf/1012.2599.pdf } +% \end{frame} +%----------------------------------------------------------------------- +%\begin{frame}[c]{Computationally Cheap Acquisition Functions - EI} +%\framesubtitle{Expected Improvement - Choosing a candidate} +% \begin{align*} +% \action<+->{\iter{\acq}_{EI}(\conf) &= \int_{\iter{I}=0}^{\iter{I}=\infty}\iter{I} \dfrac{1}{\sqrt{2\pi}\iter{\stddev}(\conf)}\exp{-\dfrac{{(\cost(\incumbent[\bocount-1])-\iter{\mean}(\conf)-\iter{I})}^2}{2\iter{\left(\variance\right)}(\conf)}}d\iter{I}\\} +% \action<+->{&= +% \begin{cases} +% \iter{\stddev}(\conf)[Z\cdf(Z) + \pdf(Z)], & \text{if }\iter{\stddev}(\conf) > 0 \\ +% 0 & \text{if }\iter{\stddev}(\conf) = 0 +% \end{cases}\\ +% \intertext{where }Z 
&=\dfrac{\cost(\incumbent[\bocount-1]) - \iter{\mean}(\conf) - \xi}{\iter{\stddev}(\conf)}} +% \comment{I believe I needed to switch the signs of $\cost(\cdot)$ and $\mean(\cdot)$ as compared to the reference paper in order to accommodate for our convention of minimization/maximization. Please cross-check!} +% \action<+->{\Aboxed{\text{Choose}\,\bonextsample \in \argmax_{\conf\in\pcs}(\iter{\acq}_{EI}(\conf))}} +% \end{align*} +% \comment{Source: Tutorial by Brochu et al.: https://arxiv.org/pdf/1012.2599.pdf } +%\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[t]{Lower/Upper Confidence Bounds (LCB/UCB): Concept} + +% \begin{figure} + \centering + \begin{tikzpicture} + \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lcb/lcb_1.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Given the surrogate fit at iteration $\bocount$}; + %fit on dataset $\iter[\bocount-1]{\dataset}$}; + \node<+> (img2) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lcb/lcb_2.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img2, align=center]{Lower Confidence Bound, $\mean(\conf)-\alpha\stddev(\conf)$ (here, for $\alpha=3$)}; + \end{tikzpicture} +% \end{figure} + +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Lower/Upper Confidence Bounds (LCB/UCB): Formal Definition} +\begin{itemize} + \item We define the \alert{Lower Confidence Bound} as + \[\alert{\iter{\acq}_{LCB}(\conf) = \iter{\mean}(\conf) - \alpha\iter{\stddev}(\conf)},\quad\alpha\geq0\] + +\bigskip + \item One can schedule $\alpha$ (e.g., increase it over time \lit{\href{https://arxiv.org/pdf/0912.3995.pdf}{Srinivas et al. 2009}}) + +\[ + \boxed{\text{Choose}\;\;\bonextsample \in \argmax_{\conf\in\pcs}\left(\alert{-} \iter{\acq}_{LCB}(\conf)\right)} +\] + +\end{itemize} + \bigskip + \pause + + \myit{ + \item Note: when one aims to \alert{maximize} the objective function, one would use \alert{UCB} instead + \myit{ + \item $\iter{\acq}_{UCB}(\conf) = \iter{\mean}(\conf) + \alpha\iter{\stddev}(\conf)$ + \item For UCB, one would choose $\bonextsample \in \argmax_{\conf\in\pcs}( \iter{\acq}_{UCB}(\conf))$ + } + } +% \item It has been shown that using the acquisition function +% \[\iter{\acq}_{GP-LCB}(\conf) = \iter{\mean}(\conf) - \sqrt{\nu\tau_t}\iter{\stddev}(\conf), \quad\nu>0,\] asymptotically results in zero cumulative regret with the appropriate choice of parameters $\tau$ and $\nu$ \lit{\href{https://arxiv.org/pdf/0912.3995.pdf}{Srinivas et al. 2009}}.} +% \comment{Trying to further explain the difference between LCB and GP-LCB's parameters would've overwhelmed the intuitiveness of the slide. Instead, a quick verbal note on the difference and pointing out the reference paper by Srinivas et al.
for further reading should suffice.} + %\comment{Source: Tutorial by Brochu et al.: https://arxiv.org/pdf/1012.2599.pdf } +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[t]{Thompson Sampling (TS): Concept} + +% \begin{figure} + \centering + \begin{tikzpicture} + \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ts/ts_1.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Given the surrogate at iteration $\bocount$ fit on dataset $\iter[\bocount-1]{\dataset}$}; + \node<+> (img2) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ts/ts_2.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img2, align=center]{Draw a sample $g$ from the predictive surrogate model}; + \node<+> (img3) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ts/ts_3.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img3, align=center]{Then choose the minimum of this sample as the next point to evaluate}; + \end{tikzpicture} +% \end{figure} + +\end{frame} +%----------------------------------------------------------------------- +% \begin{frame}[c]{Computationally Cheap Acquisition Functions - TS} +% \framesubtitle{Thompson Sampling - Gist} +% +% \begin{itemize} +% \item Draw a sample $g$ from the GP $\iter{\gp}$. +% \item Choose $\bonextsample=\argmin_{\conf\in\pcs}(g(\conf))$ +% \end{itemize} +% \end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Thompson Sampling (TS): Pseudocode} + +\begin{center} +\begin{minipage}{0.75\textwidth} +\comment{Fix algorithm numbering} +\begin{algorithm}[H] + %\DontPrintSemicolon + \LinesNumbered +% \SetAlgoLined + \setcounter{AlgoLine}{0} + \SetKwInOut{Require}{Require} + \SetKwInOut{Result}{Result} + + \Require{Search space $\pcs$, + cost function $\cost$, + surrogate model $\surro$, + maximal number of function evaluations $\bobudget$} +\Result{Best observed configuration $\finconf$ according to $\iter[\bobudget]{\dataset}$ or $\surro$} + Initialize data $\iter[0]{\dataset}$ with initial observations\;% \leftarrow \varnothing$\; + + \For{$\bocount=1$ \KwTo $\bobudget$}{ + + Fit predictive model $\iter[\bocount]{\surro}$ on $\iter[\bocount-1]{\dataset}$\; + + \textcolor{blue}{Sample a function from the surrogate: $g\sim\iter{\surro}$}\; + + \textcolor{blue}{Select next query point: $\bonextsample \in \argmin_{\conf\in\pcs}g(\conf)$}\; + + Query $\bonextobs$\; + + Update data: $\iter[\bocount]{\dataset} \leftarrow \iter[\bocount-1]{\dataset} \cup \{\langle \bonextsample, \bonextobs \rangle \}$\; + } + \caption*{Bayesian Optimization using Thompson Sampling} +\end{algorithm} +\end{minipage} +\end{center} +%\comment{Source: Paper, Kandasamy et al, http://proceedings.mlr.press/v84/kandasamy18a/kandasamy18a.pdf} +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} + +\begin{itemize} +%PI + \item \alert{Discussion.} How would you set the exploration parameter $\xi$ for PI if you want to avoid overly incremental improvements? +\medskip +%EI + \item \alert{Derivation.} Derive the closed form solution of expected improvement. +\medskip + \item \alert{Discussion.} In which situations would EI perform substantially differently than PI?
+%LCB +%TS +\end{itemize} + +\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t02_bo_loop_overview.tex b/w06_hpo_bo/t02_bo_loop_overview.tex deleted file mode 100644 index 5d156e8..0000000 --- a/w06_hpo_bo/t02_bo_loop_overview.tex +++ /dev/null @@ -1,167 +0,0 @@ -\section{Bayesian Optimization} -%---------------------------------------------------------------------- -%---------------------------------------------------------------------- -\begin{frame}[c]{Bayesian Optimization: Bayes rule} - -\begin{itemize} - \item Let $A$ and $B$ be two events with $P(B) \neq 0$ \pause - \item The conditional probability of $A$ given $B$ is defined to be: - \begin{equation*} - P(A \vert B) = \frac{P(A \cap B)}{P(B)} - \label{eq:cond_prob} - \end{equation*} \pause - \item One can rearrange the terms to show: - \begin{equation*} - P(A \cap B) = P(A \vert B) * P(B) - \end{equation*} \pause -\end{itemize} - -\begin{center} -\begin{minipage}{0.75\textwidth} -\begin{block}{Bayes rule (theorem)} -Since $A \cap B = B \cap A$, one can rewrite above relation as: - \begin{equation*} - P(A \vert B) = \frac{P(B \vert A) * P(A)}{P(B)} - \label{eq:bayes_rule} - \end{equation*} -\end{block} -\end{minipage} -\end{center} - -\end{frame} -%----------------------------------------------------------------------- - -%---------------------------------------------------------------------- -%---------------------------------------------------------------------- -\begin{frame}[c]{Bayesian Optimization: Bayes rule - example} - -\begin{block}{Bayes rule - example} - You are planning a picnic today, but the morning is cloudy: \pause - \begin{itemize} - \item 50\% of all rainy days start off cloudy, \pause - \item cloudy mornings are common (about 40\% of days start cloudy), \pause - \item it is a dry month (only 3 of 30 days tend to be rainy, or 10\%). \pause - \end{itemize} - - \emph{What is the chance of rain during the day?} -\end{block} - - \pause - -\begin{block}{Bayes rule - solution} - \begin{equation*} - \begin{aligned} - P(RainyDay \vert CloudyMorning) = \frac{P(CloudyMorning \vert RainyDay) * P(RainyDay)}{P(CloudyMorning)} \\ \pause - P(RainyDay \vert CloudyMorning) = \frac{0.5 * 0.1}{0.4} = 0.125 - \end{aligned} - \end{equation*} -\end{block} - -\note[item]{source: https://www.mathsisfun.com/data/bayes-theorem.html} -\note[item]{https://www.countbayesie.com/blog/2015/2/18/bayes-theorem-with-lego} - -\end{frame} -%----------------------------------------------------------------------- - -%---------------------------------------------------------------------- -%---------------------------------------------------------------------- -\begin{frame}[c]{Bayesian Optimization: Where does the name come from?} - -\begin{itemize} - \item Bayesian optimization uses Bayes' theorem in a form: - \begin{equation*} - P(A \vert B) \propto P(B \vert A) * P(A) - \end{equation*} \pause - \item We refer to: - \begin{itemize} - \item $A$ as a model (or hypothesis, theory), \pause - \item $B$ as a data (or observations, evidence),\pause - \item $P(A \vert B)$ as a \emph{posterior} probability of a model given a data,\pause - \item $P(B \vert A)$ as a \emph{likelihood} of a data given a model, \pause - \item $P(A)$ as a \emph{prior} probability of a model, which represents our belief about the space of possible objective functions. 
\pause - \end{itemize} - \item In our application: - \begin{equation*} - P(\func \vert \dataset_{1:\bocount}) \propto P(\dataset_{1:\bocount} \vert \func) * P(\func) - \end{equation*} \pause - where $\dataset_{1:\bocount} = \left \{ \conf_{1:\bocount}, \func(\conf_{1:\bocount}) \right \}$. - -\end{itemize} - - -\end{frame} -%----------------------------------------------------------------------- - -%---------------------------------------------------------------------- -%---------------------------------------------------------------------- -\begin{frame}[c]{Bayesian Optimization: Pseudocode} -\begin{center} -\begin{minipage}{0.75\textwidth} -\begin{algorithm}[H] - \Input{Search Space $\pcs$, - black box function $\func$, - acquisition function $\acq$, \\ - maximal number of function evaluations $\bobudget$. - } - \BlankLine - $\dataset_0$ $\leftarrow$ initial\_design($\pcs$); - - \For{\bocount = $1, 2, \ldots \bobudget - |\dataset_0|$}{ - %\While{$B$ not exhausted} { - $\surro$ $\leftarrow$ fit predictive model on $\dataset_{\bocount-1}$; - - select $\bonextsample$ by optimizing $\bonextsample \in \argmax_{\conf \in \pcs} \acq(\conf; \dataset_{\bocount-1}, \surro)$; - - Query $\bonextobs := \func(\bonextsample)$; - - Add observation to data $\dataset_{\bocount} := \dataset_{\bocount-1} \cup \{\langle \bonextsample, \bonextobs \rangle \}$;\\ - } - \Return{Best $\conf$ according to $\dataset_\bocount$ or $\surro$} - \caption{BO loop} -\end{algorithm} -\end{minipage} -\end{center} -\note[item]{how to end lines?} -\end{frame} -%----------------------------------------------------------------------- - -%----------------------------------------------------------------------- -%----------------------------------------------------------------------- -\begin{frame}[c]{Bayesian Optimization: Summary} - -\begin{columns}[T] % align columns -\begin{column}{.48\textwidth} - -\only<1-9>{ - \begin{block}{Advantages} - \begin{itemize} - \item Sample efficient \pause - \item Native incorporation of priors \pause - \item Does not require local gradients nor Hessian approximations \pause - \item ... 
- \end{itemize} - \end{block} -} -\end{column}% - -\pause -\hfill% - -\begin{column}{.48\textwidth} -\only<4-9>{ - \begin{block}{Disadvantages} - \begin{itemize} - \item Overhead because of model training in each iteration \pause - \item Inherently sequential algorithm \pause - \item Requires good choice of surrogate model \pause - \item Requires good choice of acquisition function \pause - \item Has hyperparameter on its own - \end{itemize} -\end{block} -} -\end{column} -\end{columns} - - -\end{frame} -%----------------------------------------------------------------------- \ No newline at end of file diff --git a/w06_hpo_bo/t03_acq_functions_1.tex b/w06_hpo_bo/t03_acq_functions_1.tex deleted file mode 100644 index 0aa63be..0000000 --- a/w06_hpo_bo/t03_acq_functions_1.tex +++ /dev/null @@ -1,272 +0,0 @@ -\section{Computationally Cheap Acquisition Functions} -%---------------------------------------------------------------------- -%---------------------------------------------------------------------- -\begin{frame}[c]{Acquisition Functions} -\framesubtitle{Description} -\begin{itemize} - \item \emph{Problem:} Given the surrogate function $\iter{\surro}$ at the $\bocount\,$th iteration of BO, choose the "best" candidate $\iter{\conf}\in \pcs$ to evaluate $\cost$ at next - \pause - \smallskip - \item \emph{Solution(?):} Evaluate at a global minimum of $\iter{\surro}$ - \item \emph{Issues:} - \begin{itemize} - \item The surrogate function is inaccurate. The global optimum of $\iter{\surro}$ does not necessarily yield the \emph{best} $\cost(\conf)$. - \item Considering our uncertainty in the surrogate model, we also need to \emph{trade off exploration and exploitation} - \end{itemize} - \pause - \item \emph{Real Solution}: Use a heuristic utility function $\acq(\cdot)$ aka \emph{acquisition function} that trades off exploration and exploitation! -\end{itemize} - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[t]{Computationally Cheap Acquisition Functions - PI} -\framesubtitle{Probability of Improvement - Concept} -\begin{figure} - \centering - \begin{tikzpicture} - \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_1.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Given the GP at iteration $\bocount$ fit on dataset $\iter[\bocount-1]{\dataset}$}; - - \node<+> (img2) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_2.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img2, align=center]{Current incumbent $\incumbent[\bocount-1]$ and its observed cost $\cost(\incumbent[\bocount-1])$}; - - \node<+> (img3) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_3.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img3, align=center]{Intuitively, we can disregard the section of the search space with higher costs \\than our incumbent. Hence, we now only look at the \emph{Region of Probable Improvement}}; - \comment{We cannot be absolutely certain if there will be an improvement, but we are certain that if there is to be improvement, it is only possible in this zone.} - - \node<+> (img4) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_4.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img4, align=center]{PDF of a good candidate configuration. 
Only the green area is an improvement.}; - - \node<+> (img5) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/pi/pi_5.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img5, align=center]{PDF of a bad candidate configuration}; - \end{tikzpicture} -\end{figure} - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Cheap Acquisition Functions - PI} -\framesubtitle{Probability of Improvement - Choosing a candidate} -\comment{The definitions were adapted from the source to fit an acquisition function that is maximized and an objective function which is to be minimized.!} -\begin{itemize} - \item[] - \[ - \iter{\acq}_{PI}(\conf) = P(\cost(\conf) \leq \cost(\incumbent[\bocount-1])), \quad \text{where } \incumbent[\bocount-1]\in\argmin_{\conf'\in\iter[\bocount-1]{\dataset}}\obs[\conf']\in\iter[\bocount-1]{\dataset} - \] - \pause - \bigskip - \bigskip - This can be written as: - \vspace*{-1.0cm} - \[ - \iter{\acq}_{PI}(\conf) = \cdf[Z], \quad \text{with } Z = \dfrac{\cost(\incumbent[\bocount-1]) - \iter[\bocount-1]{\mean}(\conf) - \xi}{\iter[\bocount-1]{\stddev}(\conf)}, - \] - \newline - where $\cdf(\cdot)$ is the CDF of the standard normal distribution and $\xi$ is an optional exploration parameter. - \pause - \item[] \[\boxed{\text{Choose } \bonextsample \in \argmax_{\conf\in\pcs}(\iter{\acq}_{PI}(\conf))}\] -% \comment{Source: Tutorial by Brochu et al.: https://arxiv.org/pdf/1012.2599.pdf } -\end{itemize} -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[t]{Computationally Cheap Acquisition Functions - EI} -\framesubtitle{Expected Improvement - Concept} - -\begin{figure} - \centering - \begin{tikzpicture} - \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_1.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Given the GP at iteration $\bocount$ fit on dataset $\iter[\bocount-1]{\dataset}$}; - - \node<+> (img2) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_2.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img2, align=center]{Current incumbent $\incumbent[\bocount-1]$ and its observed cost $\cost(\incumbent[\bocount-1])$}; - - \node<+> (img3) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_3.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img3, align=center]{Region of Probable Improvement}; - - \node<+> (img4) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_4.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img4, align=center]{Now forget the objective function - it's unknown anyways!}; - - \node<+> (img5) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_5.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img5, align=center]{Hypothetical \emph{real} cost at a given $\conf$ - unknown in practice without evaluating}; - - \node<+> (img6) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ei/ei_6.pdf}}; - \node<.> [below=-0.01\belowcaptionskip of img6, align=center]{Without performing an actual evaluation, we cannot calculate $\iter{I}(\conf)$}; - - \node<+> (img7) {\includegraphics[width=\linewidth, height=0.7\textheight, 
keepaspectratio=true]{images/acq_func_images/ei/ei_7.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img7, align=center]{Use $P(\conf) = \normaldist (\conf; \iter{\mean}, \iter{(\variance)})$ to calculate $\E[\iter{I}(\conf)]$ instead}; - \end{tikzpicture} -\end{figure} - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Cheap Acquisition Functions - EI} -\framesubtitle{Expected Improvement - Choosing a candidate} -\comment{Verify if formulae agree with minimizing the surrogate.} - \begin{itemize}\abovedisplayskip=0pt\belowdisplayskip=-0.5em - \item[] We first define one-step improvement over the current incumbent, as - \smallskip - \[ - \iter{I}(\conf) = \max(0, \cost(\incumbent[\bocount-1]) - \cost(\conf)), \quad\incumbent[\bocount-1]\in\argmin_{\conf'\in\iter[\bocount-1]{\dataset}}\obs[\conf']\in\iter[\bocount-1]{\dataset} - \] - \comment{This is probably a great time to point out, once again, that because I is defined in terms of the actual cost function, we cannot directly compute it.} - \pause - \medskip - \item[] Expected Improvement is then defined as - \begin{align*} - \iter{\acq}_{EI}(\conf) &= \E[\iter{I}(\conf)]\\ - &= \int_{\iter{I}=0}^{\iter{I}=\infty}\iter{I} P(\iter{I})d\iter{I} - \end{align*} - \pause - \medskip - \item[]Since the posterior distribution of the surrogate is a Gaussian, it can be shown that the distribution on $\iter{I}(\conf)$ is also a Gaussian, defined as - \[ - P(\iter{I}) = - \dfrac{1}{\sqrt{2\pi}\iter{\stddev}(\conf)}\exp{-\dfrac{{(\cost(\incumbent[\bocount-1])-\iter{\mean}(\conf)-\iter{I})}^2}{2\iter{\left(\variance\right)}(\conf)} - } - \] - \comment{Maybe emphasize that this is actually how and where the dependence on the actual cost function is replaced with a dependence on the surrogate.} - \end{itemize} -\end{frame} -%----------------------------------------------------------------------- -% \begin{frame}[c]{Computationally Cheap Acquisition Functions - EI} -% \framesubtitle{Expected Improvement - Choosing a candidate} -% \begin{align*} -% \action<+->{\iter{\acq}_{EI}(\conf) &= \int_{\iter{I}=0}^{\iter{I}=\infty}\iter{I} \dfrac{1}{\sqrt{2\pi}\iter{\stddev}(\conf)}\exp{-\dfrac{{(\cost(\incumbent[\bocount-1])-\iter{\mean}(\conf)-\iter{I})}^2}{2\iter{\left(\variance\right)}(\conf)}}d\iter{I}\\} -% \action<+->{&= -% \begin{cases} -% (\cost(\incumbent) - \iter{\mean}(\conf) - \xi)\cdf(Z) + \iter{\stddev}(\conf) \pdf(Z), & \text{if }\iter{\stddev}(\conf) > 0 \\ -% 0 & \text{if }\iter{\stddev}(\conf) = 0 -% \end{cases}\\} -% \action<+->{\intertext{where }Z} \action<.->{&=\dfrac{\cost(\incumbent) - \iter{\mean}(\conf) - \xi}{\iter{\stddev}(\conf)}} -% \action<+->{\Aboxed{\bonextsample \in \argmax_{\conf\in\pcs}(\iter{\acq}_{EI}(\conf))}} -% \end{align*} -% % \comment{Source: Tutorial by Brochu et al.: https://arxiv.org/pdf/1012.2599.pdf } -% \end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Cheap Acquisition Functions - EI} -\framesubtitle{Expected Improvement - Choosing a candidate} - \begin{align*} - \action<+->{\iter{\acq}_{EI}(\conf) &= \int_{\iter{I}=0}^{\iter{I}=\infty}\iter{I} \dfrac{1}{\sqrt{2\pi}\iter{\stddev}(\conf)}\exp{-\dfrac{{(\cost(\incumbent[\bocount-1])-\iter{\mean}(\conf)-\iter{I})}^2}{2\iter{\left(\variance\right)}(\conf)}}d\iter{I}\\} - \action<+->{&= - \begin{cases} - \iter{\stddev}(\conf)[Z\cdf(Z) + \pdf(Z)], & \text{if }\iter{\stddev}(\conf) > 0 \\ - 0 & \text{if }\iter{\stddev}(\conf) = 0 - 
\end{cases}\\ - \intertext{where }Z &=\dfrac{\cost(\incumbent[\bocount-1]) - \iter{\mean}(\conf) - \xi}{\iter{\stddev}(\conf)}} - \comment{I believe I needed to switch the signs of $\cost(\cdot)$ and $\mean(\cdot)$ as compared to the reference paper in order to accommodate for our convention of minimization/maximization. Please cross-check!} - \action<+->{\Aboxed{\text{Choose}\,\bonextsample \in \argmax_{\conf\in\pcs}(\iter{\acq}_{EI}(\conf))}} - \end{align*} -% \comment{Source: Tutorial by Brochu et al.: https://arxiv.org/pdf/1012.2599.pdf } -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[t]{Computationally Cheap Acquisition Functions - LCB/UCB} -\framesubtitle{Confidence Bounds - Concept} - -\begin{figure} - \centering - \begin{tikzpicture} - \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lcb/lcb_1.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Confidence Bound, $\mean(\conf)\pm\alpha\stddev(\conf)$}; - \node<+> (img2) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lcb/lcb_2.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img2, align=center]{Lower Confidence Bound, $\mean(\conf)-\alpha\stddev(\conf)$.}; - \node<+> (img3) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lcb/lcb_3.pdf}}; - \node<.> [below=-1.\belowcaptionskip of img3, align=center]{Pay attention that we \emph{minimize} costs (top) and \\\emph{maximize} the acquisition function (bottom)}; - \node<+> (img4) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lcb/lcb_4.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img4, align=center]{We choose $\bonextsample=\argmax(LCB(\conf))$}; - \end{tikzpicture} -\end{figure} - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Cheap Acquisition Functions - LCB/UCB} -\framesubtitle{Confidence Bounds - Choosing a candidate} -\begin{itemize} - \item<+->{We define the Lower Confidence Bound as - \[\iter{\acq}_{LCB}(\conf) = \iter{\mean}(\conf) - \alpha\iter{\stddev}(\conf),\quad\alpha\geq0\]} - \item<+->{It has been shown that using the acquisition function - \[\iter{\acq}_{GP-LCB}(\conf) = \iter{\mean}(\conf) - \sqrt{\nu\tau_t}\iter{\stddev}(\conf), \quad\nu>0,\] asymptotically results in zero cumulative regret with the appropriate choice of parameters $\tau$ and $\nu$ \lit{\href{https://arxiv.org/pdf/0912.3995.pdf}{Srinivas et al. 2009}}.} - \comment{Trying to further explain the difference between LCB and GP-LCB's parameters would've overwhelmed the intuitiveness of the slide. Instead, a quick verbal note on the difference and pointing out the reference paper by Srinivas et al. 
for further reading should suffice.} - %\comment{Source: Tutorial by Brochu et al.: https://arxiv.org/pdf/1012.2599.pdf } -\end{itemize} -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[t]{Computationally Cheap Acquisition Functions - TS} -\framesubtitle{Thompson Sampling - Concept} - -\begin{figure} - \centering - \begin{tikzpicture} - \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ts/ts_1.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Given the GP at iteration $\bocount$ fit on dataset $\iter[\bocount-1]{\dataset}$}; - \node<+> (img2) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ts/ts_2.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img2, align=center]{Draw a sample $g$ from the GP}; - \node<+> (img3) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ts/ts_3.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img3, align=center]{Then choose the minimum of this sample to evaluate at next}; - % \node<+> (img4) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/ts/ts_4.pdf}}; - % \node<.> [below=0.01\belowcaptionskip of img4, align=center]{Then choose the minimum of this sample}; - \end{tikzpicture} -\end{figure} - -\end{frame} -%----------------------------------------------------------------------- -% \begin{frame}[c]{Computationally Cheap Acquisition Functions - TS} -% \framesubtitle{Thompson Sampling - Gist} -% -% \begin{itemize} -% \item Draw a sample $g$ from the GP $\iter{\gp}$. -% \item Choose $\bonextsample=\argmin_{\conf\in\pcs}(g(\conf))$ -% \end{itemize} -% \end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Cheap Acquisition Functions - TS} -\framesubtitle{Thompson Sampling - Choosing a candidate} -\begin{center} -\begin{minipage}{0.75\textwidth} -\comment{Fix algorithm numbering} -\begin{algorithm}[H] - %\DontPrintSemicolon - \LinesNumbered - \SetAlgoLined - \setcounter{AlgoLine}{0} - \SetKwInOut{Require}{Require} - \SetKwInOut{Result}{Result} - - \Require{Search space $\pcs$, - cost function $\cost$, - Gaussian process $\gp$, - maximal number of function evaluations $\bobudget$} -\Result{Best observed configuration $\finconf$ according to $\iter[\bobudget]{\dataset}$ or $\gp$} - $\iter[0]{\dataset}\leftarrow\varnothing$\; - - \For{$\bocount=1$ \KwTo $\bobudget$}{ - - $\iter[\bocount]{\gp}$ $\leftarrow$ fit \textcolor{blue}{Gaussian process} on $\iter[\bocount-1]{\dataset}$\; - - \textcolor{blue}{Sample $g\sim\iter{\gp}$}\; - - \textcolor{blue}{$\bonextsample\leftarrow \bonextsample \in \argmin_{\conf\in\pcs}g(\conf)$}\; - - Query $\bonextobs$\; - - $\iter[\bocount]{\dataset} \leftarrow \iter[\bocount-1]{\dataset} \cup \{\langle \bonextsample, \bonextobs \rangle \}$\; - } - \caption{Bayesian Optimization using Thompson Sampling} -\end{algorithm} -\end{minipage} -\end{center} -%\comment{Source: Paper, Kandasamy et al, http://proceedings.mlr.press/v84/kandasamy18a/kandasamy18a.pdf} -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} - -\begin{itemize} -%PI - \item \emph{Discussion.} How would you set the exploration parameter $\xi$ for PI in practice? 
-%EI - \item \emph{Derive.} Starting from the improvement over the current incumbent, derive the closed form solution of expected improvement. - \item \emph{Discussion.} In which situations would EI perform substantially different than PI? -%LCB -%TS -\end{itemize} - -\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t03_acq_functions_2.pdf b/w06_hpo_bo/t03_acq_functions_2.pdf new file mode 100644 index 0000000..e8cb3df Binary files /dev/null and b/w06_hpo_bo/t03_acq_functions_2.pdf differ diff --git a/w06_hpo_bo/t03_acq_functions_2.tex b/w06_hpo_bo/t03_acq_functions_2.tex index 1b49f62..207e3ed 100644 --- a/w06_hpo_bo/t03_acq_functions_2.tex +++ b/w06_hpo_bo/t03_acq_functions_2.tex @@ -1,32 +1,36 @@ -\section{Computationally Expensive Acquisition Functions} +\videotitle{Computationally Expensive Acquisition Functions} + %----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Expensive Acquisition Functions} -%\framesubtitle{Knowledge Gradient - Concept} -\framesubtitle{One-Step Look Ahead} +\begin{frame}[c]{A Computationally Expensive Step: One-Step Look Ahead} +\centering +\begin{overlayarea}{\textwidth}{0.8\textheight} + \only<1>{\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lookahead/look_ahead_1.pdf}} + \only<2>{\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lookahead/look_ahead_2.pdf}} + \only<3>{\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lookahead/look_ahead_3.pdf}} + \only<4>{\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lookahead/look_ahead_4.pdf}} +\end{overlayarea} + +\only<1>{Given the surrogate $\iter[\bocount]{\surro}$ fit at iteration $\bocount$} +\only<2>{Imagine that we sample at a random configuration $\conf$} +\only<3>{We would then observe the cost $\cost(\conf)$ at this imaginary configuration $\conf$} +\only<4>{With this hypothetical data point at $\lambda$, we'd have this \alert{1-step lookahead surrogate $\iter[\bocount+1]{\surro}{\!\given_{\!\conf}}(\cdot)$}} + +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Visualization of How Different the Lookahead Surrogate Can Be} -\begin{figure} +% \begin{figure} \centering \begin{tikzpicture} - \node<+> (img1) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_1.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Once more, assume such a surrogate GP $\iter{\gp}(\cdot)$ at time-step $\bocount$.}; - \node<+> (img2) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_1a.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img2, align=center]{Imagine that we sample at a random configuration $\conf$}; - - \node<+> (img3) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_1b.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img3, align=center]{We would then observe the cost $\cost(\conf)$ at this imaginary configuration $\conf$}; - - \node<+> (img4) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_3.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img4, align=center]{Then, 
$\iter[\bocount+1]{\gp}(\cdot\given {\conf})$ \emph{might} look like this. This is called a "1-step look ahead".}; - - \node<+> (img5) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_3a.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img5, align=center]{A comparison of $\iter{\gp}(\cdot)$ and $\iter[\bocount+1]{\gp}(\cdot\given {\conf})$ for a given $\conf$.}; + \node<+> (img5) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lookahead/look_ahead_5.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img5, align=center]{A comparison of $\iter{\surro}(\cdot)$ and $\iter[\bocount+1]{\surro}{\!\given_{\!\conf}}(\cdot)$ for a given $\conf$.}; - \node<+> (img6) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_3b.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img6, align=center]{A comparison of $\iter{\gp}(\cdot)$ and $\iter[\bocount+1]{\gp}(\cdot\given {\conf})$ for a different $\conf$.}; + \node<+> (img6) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lookahead/look_ahead_6.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img6, align=center]{A comparison of $\iter{\surro}(\cdot)$ and $\iter[\bocount+1]{\surro}{\!\given_{\!\conf}}(\cdot)$ for a given $\conf$.}; - \node<+> (img7) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_3c.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img7, align=center]{A comparison of $\iter{\gp}(\cdot)$ and $\iter[\bocount+1]{\gp}(\cdot\given {\conf})$ for yet another $\conf$}; + \node<+> (img7) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/lookahead/look_ahead_7.pdf}}; + \node<.> [below=0.01\belowcaptionskip of img7, align=center]{A comparison of $\iter{\surro}(\cdot)$ and $\iter[\bocount+1]{\surro}{\!\given_{\!\conf}}(\cdot)$ for a given $\conf$.}; \comment{Pardon the inconsistent label for the vertical red line - it's a bug that would need some time to trace and solve, but should not affect the readability of the plots.} @@ -36,80 +40,80 @@ \section{Computationally Expensive Acquisition Functions} \comment{This distribution is purely hypothetical - as shown by the conditional - and is called a one-step look-ahead. Just a re-statement of the GP's conditional nature at $\bocount+1$. 
Since we don't actually have the underlying objective function available in real-life scenarios, it is impossible to generate the true look-ahead without actually performing an evaluation.} \end{tikzpicture} -\end{figure} +% \end{figure} \end{frame} + %----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Expensive Acquisition Functions - KG} -\framesubtitle{Knowledge Gradient - Concept} -\begin{figure} - \centering - \begin{tikzpicture} - \node<+> (img1) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_1.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img1, align=center]{Once more, assume such a surrogate function GP $\iter{\gp}(\cdot)$ at time-step $\bocount$.}; - - \node<+> (img2) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_KG_2.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img2, align=center]{Given that we are risk-neutral, the configuration corresponding to the minimum \\of the mean function, $\iter{\left(\mean^*\right)}$, is the best choice here.}; - - \node<+> (img3) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_3.pdf}}; - \node<.> [below=0.01\belowcaptionskip of img3, align=center]{If we perform a one-step look-ahead, we would get $\iter[\bocount+1]{\gp}(\cdot\given {\conf})$ for some configuration $\conf$.}; - - \node<+> (img4) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_KG_4.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img4, align=center]{The best risk-neutral choice is again given by the minimum \\of the new conditional mean function - $\iter[\bocount+1]{\left(\mean^*\right)} \given_{\conf}$.}; - - \node<+> (img5) {\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/look_ahead_KG_5.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img5, align=center]{The actual improvement in cost - from $\iter{\left(\mean^*\right)}$ to $\iter[\bocount+1]{\left(\mean^*\right)}$ - cannot be computed \\without evaluation. We compute its expected value i.e. 
the \emph{Knowledge Gradient}.}; - \end{tikzpicture} -\end{figure} +\begin{frame}[c]{Knowledge Gradient (KG): Concept} + +% \begin{figure} +\centering +\begin{overlayarea}{\textwidth}{0.8\textheight} + \only<1>{\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/kg_1.pdf}} + \only<2>{\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/kg_2.pdf}} + \only<3>{\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/kg_3.pdf}} + \only<4>{\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/kg_4.pdf}} + \only<5>{\includegraphics[width=\textwidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/kg/kg_5.pdf}} +\end{overlayarea} +\only<1>{Given the surrogate $\surro(\conf) = \normaldist( \mean(\conf), \variance(\conf))$ +fit at iteration $\bocount$} + +\only<2>{If we are risk-neutral, we'd return $\arg\min_{\conf} \iter{\left(\mean(\conf)\right)}$ as incumbent, with \alert{value $\iter[\bocount]{\left(\mean^*\right)}$}} + +\only<3>{If we perform a one-step look-ahead for configuration $\conf$, we would get $\iter[\bocount+1]{\surro}{\!\given_{\!\conf}}$} + +\only<4>{We would then be interested in the \alert{minimum of the updated mean function $\iter[\bocount+1]{\left(\mean^*\right)} \given_{\conf}$}} +% The best risk-neutral choice would then the minimum of the new conditional mean function: $\iter[\bocount+1]{\left(\mean^*\right)} \given_{\conf}$.}; + +\only<5>{The \alert{Knowledge Gradient} is then the \alert{expectation of the improvement $\iter[\bocount]{\left(\mean^*\right)} - \iter[\bocount+1]{\left(\mean^*\right)} \!\given_{\!\conf}$}} +% \end{figure} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Expensive Acquisition Functions - KG} -\framesubtitle{Knowledge Gradient - Choosing a candidate} -\begin{itemize}\belowdisplayskip=1.5em - \item<+-> Given a GP $\iter{\gp}$ fit on $\iter[\bocount-1]{\dataset}$ on the $\bocount\,$th iteration, we have - \[ - \iter{\left(\mean^*\right)} = \min_{\conf'\in\pcs}\,\iter{\mean}\left(\conf'\big|\iter[\bocount-1]{\dataset}\right) - \] - - \item<+-> If we choose a candidate $\bonextsample=\conf$ to evaluate $\cost(\cdot)$ at, - \[ - \left.\iter{\dataset}\right|_{\conf} = \iter[\bocount-1]{\dataset}\cup\left\{\left\langle\bonextsample,\,\bonextobs\right\rangle\big|\bonextsample=\conf\right\} - \] - - \item <+-> Thus, if we hypothesize about the $\bocount+1\,$th iteration, we would get - \[ - \begin{split} - \left.\iter[\bocount+1]{\left(\mean^*\right)} \right|_{\conf} - &= \min_{\conf'\in\pcs} \iter[\bocount+1]{\mean} \left(\conf'\big| \iter{\dataset},\bonextsample=\conf \right)\\ - %&= \min_{\conf'\in\pcs} \iter[\bocount+1]{\mean} \left(\conf'\big| \iter[\bocount-1]{\dataset},\conf,\obs \right)\\ - \end{split} - \] -\end{itemize} -\comment{Source:https://arxiv.org/pdf/1807.02811.pdf} -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Expensive Acquisition Functions - KG} -\framesubtitle{Knowledge Gradient - Choosing a candidate} -\begin{itemize}\belowdisplayskip=1.5em - \item<+->In a risk-neutral setting, $\iter{\left(\mean^*\right)}$ and $\left.\iter[\bocount+1]{\left(\mean^*\right)}\right|_{\conf}$ are the global optima for $\iter{\mean}$ and
$\left.\iter[\bocount+1]{\mean}\right|_{\conf}$ respectively. - - \item<+-> Thus, the conditional improvement in the cost is - \[ - \left.\iter{I}\right|_{\conf}=\iter{\left(\mean^*\right)} - \left.\iter[\bocount+1]{\left(\mean^*\right)} \right|_{\bonextsample=\conf} - \] - - \item<+-> We cannot directly compute this improvement without performing an evaluation, but we can compute its expected value, which we call \emph{Knowledge Gradient}. - \lit{\href{https://arxiv.org/pdf/1807.02811.pdf}{Frazier 2018}} -\end{itemize} -\end{frame} +%\begin{frame}[c]{Knowledge Gradient (KG): Formal Definition (1/3)} +%\begin{itemize}\belowdisplayskip=1.5em +% \item<+-> Given a GP $\iter{\gp}$ fit on $\iter[\bocount-1]{\dataset}$ on the $\bocount\,$th iteration, we have +% \[ +% \iter{\left(\mean^*\right)} = \min_{\conf'\in\pcs}\,\iter{\mean}\left(\conf'\big|\iter[\bocount-1]{\dataset}\right) +% \] +% +% \item<+-> If we choose a candidate $\bonextsample=\conf$ to evaluate $\cost(\cdot)$ at, +% \[ +% \left.\iter{\dataset}\right|_{\conf} = \iter[\bocount-1]{\dataset}\cup\left\{\left\langle\bonextsample,\,\bonextobs\right\rangle\big|\bonextsample=\conf\right\} +% \] +% +% \item <+-> Thus, if we hypothesize about the $\bocount+1\,$th iteration, we would get +% \[ +% \begin{split} +% \left.\iter[\bocount+1]{\left(\mean^*\right)} \right|_{\conf} +% &= \min_{\conf'\in\pcs} \iter[\bocount+1]{\mean} \left(\conf'\big| \iter{\dataset},\bonextsample=\conf \right)\\ +% %&= \min_{\conf'\in\pcs} \iter[\bocount+1]{\mean} \left(\conf'\big| \iter[\bocount-1]{\dataset},\conf,\obs \right)\\ +% \end{split} +% \] +%\end{itemize} +%\comment{Source:https://arxiv.org/pdf/1807.02811.pdf} +%\end{frame} +%%----------------------------------------------------------------------- +%\begin{frame}[c]{Knowledge Gradient (KG): Formal Definition (2/3)} +%\begin{itemize}\belowdisplayskip=1.5em +% \item<+->In a risk-neutral setting, $\iter{\left(\mean^*\right)}$ and $\left.\iter[\bocount+1]{\left(\mean^*\right)}\right|_{\conf}$ are the global optima for $\iter{\mean}$ and $\left.\iter[\bocount+1]{\mean}\right|_{\conf}$ respectively. +% +% \item<+-> Thus, the conditional improvement in the cost is +% \[ +% \left.\iter{I}\right|_{\conf}=\iter{\left(\mean^*\right)} - \left.\iter[\bocount+1]{\left(\mean^*\right)} \right|_{\bonextsample=\conf} +% \] +% +% \item<+-> We cannot directly compute this improvement without performing an evaluation, but we can compute its expected value, which we call \emph{Knowledge Gradient}. 
+% \lit{\href{https://arxiv.org/pdf/1807.02811.pdf}{Frazier 2018}} +%\end{itemize} +%\end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Expensive Acquisition Functions - KG} -\framesubtitle{Knowledge Gradient - Choosing a candidate} +\begin{frame}[c]{Knowledge Gradient (KG): Formal Definition} \begin{itemize}\belowdisplayskip=1.5em - \item<+-> Thus, + \item The Knowledge Gradient is the \alert{expectation of the improvement $\iter[\bocount]{\left(\mean^*\right)} - \iter[\bocount+1]{\left(\mean^*\right)} \!\given_{\!\conf}$}: \[ \begin{split} \iter{\acq}_{KG}(\conf) @@ -120,64 +124,163 @@ \section{Computationally Expensive Acquisition Functions} \left[\min_{\conf'\in\pcs}\iter[\bocount+1]{\mean}\left(\conf'\given{\iter[\bocount-1]{\dataset} \cup \left\{\left\langle\conf, \tilde{\cost} \right\rangle \right\}}\right)\right] \end{split} \] - - \item<+-> Finally, - \[ - \text{Choose}\,\boxed{\bonextsample = \argmax_{\conf\in\pcs}(\iter{\acq}_{KG}(\conf))} +%\bigskip +%\pause +% \item Approximating the expectation is the expensive step; for an efficient approach, see \lit{\href{https://arxiv.org/pdf/1807.02811.pdf}{Frazier 2018}} +\pause +\bigskip +\[ + \boxed{\text{Choose}\;\;\bonextsample = \argmax_{\conf\in\pcs}(\iter{\acq}_{KG}(\conf))} \] \comment{\lit{\href{https://arxiv.org/pdf/1807.02811.pdf}{Frazier 2018}}} \end{itemize} \end{frame} + + + +\begin{frame}[c]{Knowledge Gradient: Pseudocode for Monte Carlo Approximation} + +\[\iter{\acq}_{KG}(\conf) = const - \alert{ +\E_{\tilde{\cost} \sim \iter[\bocount]{\surro(\lambda)}} + %{p\left(\obs\big|\iter[\bocount-1]{\dataset},\conf\right)} + \left[\min_{\conf'\in\pcs}\iter[\bocount+1]{\mean}\left(\conf'\given{\iter[\bocount-1]{\dataset} \cup \left\{\left\langle\conf, \tilde{\cost} \right\rangle \right\}}\right)\right] +}\] + +\begin{center} +\begin{minipage}{0.75\textwidth} +\comment{Fix algorithm numbering} +\begin{algorithm}[H] + %\DontPrintSemicolon + \LinesNumbered +% \SetAlgoLined + \setcounter{AlgoLine}{0} + \SetKwInOut{Require}{Require} + \SetKwInOut{Result}{Result} + + \Require{Surrogate $\surro$, candidate configuration $\conf$, dataset $\dataset$} + \Result{Utility $\acq(\conf)$} + + \For{$s=1$ \KwTo $S$}{ + Sample $\tilde{c}_s \sim \surro(\conf)$\; + + Update $\surro$ with $\{\left\langle\conf, \tilde{c}_s\right\rangle\}$ to yield $\surro_s = \normaldist( \mean_s, \variance_s)$\; + + $e[s]\leftarrow \min_{\conf'\in\pcs}\mean_s(\conf')$ + } + + $\acq\leftarrow const - \frac{1}{S} \sum_{s=1}^S e[s]$\; + + \caption*{Sampling Based Knowledge Gradient Acquisition Function} +\end{algorithm} +\end{minipage} + +\pause +\bigskip +This sampling view is useful for intuition;\\ but in practice, there are more efficient ways to optimize KG \lit{\href{https://arxiv.org/pdf/1807.02811.pdf}{Frazier 2018}} + +\end{center} + +\end{frame} + %----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Expensive Acquisition Functions - ES} -\framesubtitle{Entropy Search - Choosing a candidate} +\begin{frame}[c]{Entropy Search Preliminaries} \begin{itemize} - \item<+-> Idea: Evaluate $\conf$ which reduces our uncertainty about the location of $\optconf$.
- \item<+-> Do this by minimizing the entropy of the distribution of the lowation of the minimum: $\mathcal{H}(p_{min})$ - \item<+-> $p_{min}(\conf^*|\dataset) = p(\conf^* \in \argmin_{\conf' \in \pcs} (\surro(\conf') | \dataset))$ - \item<+-> We define the acquisition function as - \[ - \begin{split} - \iter{\acq}_{ES}(\conf,\dataset) = \mathcal{H}\left(p_{min}(\conf^*)|\dataset \right) - \E_{\tilde{\cost} \sim \iter[\bocount]{\surro(\conf)}}\left[\mathcal{H} \left( p_{min}(\conf^*)|\dataset \cup \left\{ \left\langle \conf, \tilde{\cost} \right\rangle \right\} \right)\right] - \end{split} - \] - \item<+-> Entropy search contains a similar look-ahead as knoledge gradient - \item<+-> And as always, - \[ - \text{Choose}\,\boxed{\bonextsample = \argmax_{\conf\in\pcs}(\iter{\acq}_{ES}(\conf))} - \] + \item Key idea: Evaluate $\conf$ which most \alert{reduces our uncertainty about the location of $\optconf$} +\bigskip +\pause + \item We'll use the $p_{min}$ distribution to characterize the location of $\optconf$: \alert{\[p_{min}(\conf^*|\dataset) = p(\conf^* \in \argmin_{\conf' \in \pcs} (\surro(\conf') | \dataset))\]} +\medskip +\pause + \item Our uncertainty is then captured by the \alert{entropy $H( p_{min}(\cdot |\dataset))$ of the $p_{min}$ distribution} +\pause +\bigskip + \item Minimizing $H( p_{min}(\cdot |\dataset))$ yields a peaked $p_{min}$ distribution, i.e., strong knowledge about the location of $\optconf$ + +% Do this by minimizing the entropy of the distribution of the lowation of the minimum: $\mathcal{H}(p_{min})$ +% \item<+-> $p_{min}(\conf^*|\dataset) = p(\conf^* \in \argmin_{\conf' \in \pcs} (\surro(\conf') | \dataset))$ +% \item<+-> We define the acquisition function as +% \[ +% \begin{split} +% \iter{\acq}_{ES}(\conf,\dataset) = \mathcal{H}\left(p_{min}(\conf^*)|\dataset \right) - \E_{\tilde{\cost} \sim \iter[\bocount]{\surro(\conf)}}\left[\mathcal{H} \left( p_{min}(\conf^*)|\dataset \cup \left\{ \left\langle \conf, \tilde{\cost} \right\rangle \right\} \right)\right] +% \end{split} +% \] +% \item<+-> Entropy search contains a similar look-ahead as knoledge gradient +% \item<+-> And as always, +% \[ +% \text{Choose}\,\boxed{\bonextsample = \argmax_{\conf\in\pcs}(\iter{\acq}_{ES}(\conf))} +% \] \end{itemize} -\comment{Source:\lit{\href{Frazier 2018}{https://arxiv.org/pdf/1807.02811.pdf}}} + \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Expensive Acquisition Functions - ES} -\framesubtitle{Entropy Search - Concept} +\begin{frame}[c]{Entropy Search: Visualization of the $p_{min}$ Distribution} -\begin{figure} +% \begin{figure} \centering \begin{tikzpicture} - \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/es/es_1.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img1, align=center]{We consider the global optimum's position to be a random variable $\conf^*$, \\ so each configuration $\conf$ can be assigned a probability $p_{min}=P(\conf=\conf^*)$.}; +% \node<+> (img1) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/es/es_1.pdf}}; +% \node<.> [below=-1.0\belowcaptionskip of img1, align=center]{ +% %We consider the global optimum's position to be a random variable $\conf^*$, \\ so each configuration $\conf$ can be assigned a probability $p_{min}=P(\conf=\conf^*)$.}; +% Initially, the $p_{min}$ distribution is uniform}; \node<+> (img2) {\includegraphics[width=\linewidth, 
height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/es/es_2.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img2, align=center]{The minimizing configuration $\hat{\conf}$ of a sample from the GP $\iter{\gp}$ \\provides some evidence for where $\conf^*$ may lie.}; + \node<.> [below=-1.0\belowcaptionskip of img2, align=center]{For each sample drawn from $\surro$, we can compute where $\conf^*$ lies}; \node<+> (img3) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/es/es_3.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img3, align=center]{Each new sample provides more information about where the global minimum lies - \\i.e. has an associated \emph{information gain}.}; + \node<.> [below=-1.0\belowcaptionskip of img3, align=center]{For each sample drawn from $\surro$, we can compute where $\conf^*$ lies}; \node<+> (img4) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/es/es_4.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img4, align=center]{After many such samples, \\we can narrow down the approximate location of the global minimum.}; + \node<.> [below=-1.0\belowcaptionskip of img4, align=center]{From many samples we can approximate the $p_{min}$ distribution}; \node<+> (img5) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/es/es_5.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img5, align=center]{An approximate probability distribution for $p_{min}$ can now be generated.}; - \comment{Since the aim was to illustrate a concept, the underlying code for these plots used simplified max-value entropy search, and the final PDF was generated using Kernel Density Estimation, thus leading to an artifact at the right edge where a very high bin value was "lost".} + \node<.> [below=-1.0\belowcaptionskip of img5, align=center]{From many samples we can approximate the $p_{min}$ distribution}; +% \comment{Since the aim was to illustrate a concept, the underlying code for these plots used simplified max-value entropy search, and the final PDF was generated using Kernel Density Estimation, thus leading to an artifact at the right edge where a very high bin value was "lost". +% } + +% \node<+> (img6) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/es/es_6.pdf}}; +% \node<.> [below=-1.0\belowcaptionskip of img6, align=center]{ + %FH: this is still WRONG, I commented out the slide! 
%The configuration that is most likely to be $\conf^*$ provides the greatest information gain,\\ or in other words, would reduce the entropy of the search space the most when evaluated.}; - \node<+> (img6) {\includegraphics[width=\linewidth, height=0.7\textheight, keepaspectratio=true]{images/acq_func_images/es/es_6.pdf}}; - \node<.> [below=-1.0\belowcaptionskip of img6, align=center]{The configuration that is most likely to be $\conf^*$ provides the greatest information gain,\\ or in other words, would reduce the entropy of the search space the most when evaluated.}; \end{tikzpicture} -\end{figure} +% \end{figure} + +% \notefh{The figures should not jump around the slide in a PDF animation.} + +\end{frame} + +%----------------------------------------------------------------------- +\begin{frame}[c]{Entropy Search: Formal Definition} +\begin{itemize} + + \item The $p_{min}$ distribution characterizes the location of $\optconf$: \alert{\[p_{min}(\conf^*|\dataset) = p(\conf^* \in \argmin_{\conf' \in \pcs} (\surro(\conf') | \dataset))\]} +\smallskip +\pause + \item Our uncertainty about the location of $\optconf$ is captured by the \alert{entropy $H( p_{min}(\cdot |\dataset))$ of the $p_{min}$ distribution} +\bigskip +\pause +\item \alert{Entropy search aims to minimize $H(p_{min})$}, to yield a peaked $p_{min}$ distribution: + %, i.e., strong knowledge about the location of $\optconf$: + + \alert{\[u_{ES}(\conf) = H( p_{min}(\cdot |\dataset)) - \E_{\tilde{\cost} \sim \iter[\bocount]{\surro(\lambda)}} H( p_{min}(\cdot |\dataset \cup \left\{\left\langle\conf, \tilde{\cost} \right\rangle \right\})) \]} + +% Do this by minimizing the entropy of the distribution of the lowation of the minimum: $\mathcal{H}(p_{min})$ +% \item<+-> $p_{min}(\conf^*|\dataset) = p(\conf^* \in \argmin_{\conf' \in \pcs} (\surro(\conf') | \dataset))$ +% \item<+-> We define the acquisition function as +% \[ +% \begin{split} +% \iter{\acq}_{ES}(\conf,\dataset) = \mathcal{H}\left(p_{min}(\conf^*)|\dataset \right) - \E_{\tilde{\cost} \sim \iter[\bocount]{\surro(\conf)}}\left[\mathcal{H} \left( p_{min}(\conf^*)|\dataset \cup \left\{ \left\langle \conf, \tilde{\cost} \right\rangle \right\} \right)\right] +% \end{split} +% \] +% \item<+-> Entropy search contains a similar look-ahead as knoledge gradient +% \item<+-> And as always, + \[ + \boxed{\text{Choose}\;\;\bonextsample = \argmax_{\conf\in\pcs}(\iter{\acq}_{ES}(\conf))} + \] +\end{itemize} \end{frame} +%----------------------------------------------------------------------- + % %---------------------------------------------------------------------- % \begin{frame}[c]{Computationally Expensive Acquisition Functions - ES} % \framesubtitle{Entropy Search - Pseudocode for Sampling Version} @@ -216,64 +319,80 @@ \section{Computationally Expensive Acquisition Functions} % \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Computationally Expensive Acquisition Functions - ES} -\framesubtitle{Entropy Search - Pseudocode for Sampling Version} +\begin{frame}[c]{Entropy Search: Pseudocode for Monte Carlo Approximation} + +\vspace*{-0.2cm} +\[u_{ES}(\conf) = const - \alert{\E_{\tilde{\cost} \sim \iter[\bocount]{\surro(\lambda)}} H( p_{min}(\cdot |\dataset \cup \left\{\left\langle\conf, \tilde{\cost} \right\rangle \right\}))} \] +\vspace*{-0.5cm} \begin{center} -\begin{minipage}{0.75\textwidth} +\begin{minipage}{0.9\textwidth} \comment{Fix algorithm numbering} \begin{algorithm}[H] +\footnotesize{} %\DontPrintSemicolon \LinesNumbered - 
\SetAlgoLined +% \SetAlgoLined \setcounter{AlgoLine}{0} \SetKwInOut{Require}{Require} \SetKwInOut{Result}{Result} - \Require{Gaussian process $\gp$, candidate configuration $\conf$, finite set of representer points $\pcs_{repr}$, dataset $\dataset$} + \Require{Surrogate $\surro$, candidate configuration $\conf$, finite set of representer points $\pcs_{r}$, dataset $\dataset$} \Result{Utility $\acq(\conf)$} - $F\leftarrow{\langle F_{\conf} = 0 \rangle}_{\conf'\in\pcs_{repr}}$ - Frequencies to compute $p_{min}$\; \For{$s=1$ \KwTo $S$}{ - - %Sample $\tilde{c}_s \sim \normaldist(\conf | \mu_{\iter{\gp}}, \sigma_{\iter{\gp}})$\; - Sample $\tilde{c}_s \sim \gp$ at $\conf$\; - - $\gp_s \leftarrow{}$ Condition $\gp$ on $\dataset \cup \{\left\langle\conf, \tilde{c}_s\right\rangle\}$\; - - $\conf_s\leftarrow\argmin_{\conf' \in \conf_{cand}}(\gp_s)$\; - - $F[\conf_s]\leftarrow F[\conf_s] + 1$\;} + %Sample $\tilde{c}_s \sim \normaldist(\conf | \mu_{\iter{\gp}}, \sigma_{\iter{\gp}})$\; + Sample $\tilde{c}_s \sim \surro(\conf)$; \; + $\surro_s \leftarrow $ Update $\surro$ with + $\{\left\langle\conf, \tilde{c}_s\right\rangle\}$\; - $p_{min}(\conf') \leftarrow F_{\conf'} / \sum_{\conf'' \in \pcs_{repr}} F_{\conf''} \forall{\conf' \in \pcs_{repr}} $ \; - - $\acq\leftarrow\mathcal{H}(p_{min}) = - \sum_{\conf' \in \pcs_{repr}} p_{min}(\conf') \log p_{min}(\conf')$\; + Initialize $F[\conf]=0$ $\;\; \forall \conf'\in\pcs_{r}$ - \caption{Sampling Based Entropy Search Acquisition Function} + \For{$n=1$ \KwTo $N$} + { + Sample $g_n\sim\surro_s$\; + + $\conf_s\leftarrow\argmin_{\conf' \in \pcs_{r}} g_n$\; + + $F[\conf_s]\leftarrow + F[\conf_s] + 1$\; + } + $p_{min,s}(\conf') \leftarrow F_{\conf'} / N \;\;\; \forall{\conf' \in \pcs_{r}} $ \; + + $H_s \leftarrow H(p_{min,s}) \text{, computed as } - \sum_{\conf' \in \pcs_{r}} p_{min,s}(\conf') \log p_{min,s}(\conf')$\; + } + $\acq\leftarrow const - \frac{1}{S} \sum_{s=1}^{S} H_s$ + + \caption*{Sampling Based Entropy Search Acquisition Function} \end{algorithm} \end{minipage} \end{center} \end{frame} %---------------------------------------------------------------------- -\begin{frame}[c]{Computationally Expensive Acquisition Functions - ES} -\framesubtitle{Entropy Search - Varieties} +\begin{frame}[c]{Entropy Search: Variations} \begin{itemize} %\item<+-> Note how repeated Thompson Sampling inherently approximates sampling based entropy-search! - \item<+-> A faster, but mathematically more involved version can be found in the \emph{original paper} on Entropy Search. \lit{\href{http://jmlr.csail.mit.edu/papers/volume13/hennig12a/hennig12a.pdf}{Hennig et al. 2012}}. - \item<+-> Instead of computing ES directly, an alternative formulation called \emph{Predictive Entropy Search} is often used. \lit{\href{http://papers.nips.cc/paper/5324-predictive-entropy-search-for-efficient-global-optimization-of-black-box-functions.pdf}{Hernández-Lobato et al. 2014}} - \item<+-> There exists a variant called \emph{Max-Value Entropy Search} which is cheaper to compute and has similar behavior. \lit{\href{https://arxiv.org/abs/1703.01968}{Wang and Jegelka 2017}} - \item<+-> Further reading and summary for ES: \lit{\href{https://arxiv.org/pdf/1602.01064.pdf}{Metzen 2016}}. + \item The sample-based approximation is slow; for a faster approximation with expectation propagation see the original ES paper \lit{\href{http://jmlr.csail.mit.edu/papers/volume13/hennig12a/hennig12a.pdf}{Hennig et al. 
2012}} +\medskip + \item \alert{Predictive Entropy Search} \lit{\href{http://papers.nips.cc/paper/5324-predictive-entropy-search-for-efficient-global-optimization-of-black-box-functions.pdf}{Hernández-Lobato et al. 2014}} is a frequently-used equivalent formulation that gives rise to more convenient approximations +\medskip + \item \alert{Max-Value Entropy Search} + \lit{\href{https://arxiv.org/abs/1703.01968}{Wang and Jegelka 2017}} is a recent variant that + is cheaper to compute and has similar behavior +\medskip + \item Further reading and summary for ES: \lit{\href{https://arxiv.org/pdf/1602.01064.pdf}{Metzen 2016}} \end{itemize} \end{frame} %----------------------------------------------------------------------- \begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} \begin{itemize} %KG - \item \emph{Repetition.} Describe the differences between KG and EI? + \item \alert{Repetition.} Describe the similarities and differences between KG and EI. +\medskip %ES - \item \emph{Discussion.} Is there an incentive for Entropy search to sample at $\max(p_{min})$? - \item \emph{Discussion.} How would you optimize the acquisition function in practice? + \item \alert{Discussion.} When is there an incentive for entropy search to sample at $\max(p_{min})$? +% \item \emph{Discussion.} How would you optimize the acquisition function in practice? \end{itemize} \end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t04_surrogate_models.pdf b/w06_hpo_bo/t04_surrogate_models.pdf new file mode 100644 index 0000000..8cd519d Binary files /dev/null and b/w06_hpo_bo/t04_surrogate_models.pdf differ diff --git a/w06_hpo_bo/t04_surrogate_models.tex b/w06_hpo_bo/t04_surrogate_models.tex index 84d9c59..655216d 100644 --- a/w06_hpo_bo/t04_surrogate_models.tex +++ b/w06_hpo_bo/t04_surrogate_models.tex @@ -1,99 +1,92 @@ -\section{Surrogate Models} +\videotitle{Surrogate Models} + %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Desiderata} +\myframetop{Desiderata for Surrogate Models in Bayesian Optimization}{ -\begin{columns}[T] % align columns -\begin{column}{.48\textwidth} -\only<1-2>{ - \begin{block}{Mandatory} - \begin{itemize} - \item Regression model - \item Uncertainty estimates - \item Accurate predictions - \end{itemize} - \end{block} -} -\only<2-2>{ - \begin{block}{Depending on the application} - \begin{itemize} - \item Cheap-to-train - \item Scales with the complexity of the data - \note[item]{(number of features and observations)} - \item Can handle different types of inputs - \note[item]{(categorical and continuous)} - \end{itemize} - \end{block} + \begin{columns}[T] % align columns + \begin{column}{.48\textwidth} + \only<1-2>{ + \begin{block}{In all cases} + \begin{itemize} + \item Regression model with uncertainty estimates + \item Accurate predictions + \end{itemize} + \end{block} + } + \only<2-2>{ + \begin{block}{Depending on the application} + \begin{itemize} + \item Is cheap to train + \item Scales well in the number of data points + \item Scales well in the number of dimensions + \item Can handle different types of inputs (categorical and continuous) + \end{itemize} + \end{block} + } + \end{column}% + + \hfill% + + \begin{column}{.48\textwidth} + \bigskip + \bigskip + %\only<1-1>{\includegraphics[width=1.\textwidth]{images/bo_loop_overview/03_mean.png}} + %\only<2-6>{ + \includegraphics[width=\textwidth]{w06_hpo_bo/images/bo_loop_overview/Uncertainty.pdf} + %} + + \end{column}% + \end{columns} } 
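To make the desiderata above concrete, the following is a minimal illustrative Python sketch (not taken from the slides) of an uncertainty-aware regression surrogate: a random forest whose empirical spread across trees serves as the uncertainty estimate, the same mechanism described on the random-forest slides further below. The toy objective, the data, and all variable names are assumptions made up for this example; only NumPy and scikit-learn's RandomForestRegressor are used.

import numpy as np
from sklearn.ensemble import RandomForestRegressor

def toy_cost(x):
    # Made-up 1-d black-box objective standing in for a real validation-loss surface
    return np.sin(3.0 * x) + 0.1 * x ** 2

rng = np.random.default_rng(seed=0)
X_observed = rng.uniform(-3.0, 3.0, size=(20, 1))   # already-evaluated configurations
y_observed = toy_cost(X_observed).ravel()           # their observed costs

# Regression model: a random forest fitted to the observations
surrogate = RandomForestRegressor(n_estimators=100, bootstrap=True, random_state=0)
surrogate.fit(X_observed, y_observed)

# Uncertainty estimates: mean and empirical std across the individual trees
X_candidates = np.linspace(-3.0, 3.0, num=200).reshape(-1, 1)
per_tree = np.stack([tree.predict(X_candidates) for tree in surrogate.estimators_])
mu = per_tree.mean(axis=0)     # predicted cost for each candidate configuration
sigma = per_tree.std(axis=0)   # model uncertainty, usable by an acquisition function

best = X_candidates[np.argmin(mu - sigma)]  # e.g., an optimistic (LCB-style) pick
print(f"candidate with lowest optimistic cost estimate: {best[0]:.2f}")

Any model exposing such a mean/uncertainty pair per candidate configuration can be plugged into the Bayesian optimization loop, which is why the surrogate choices discussed next (Gaussian processes, random forests, Bayesian neural networks) are interchangeable from the acquisition function's point of view.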
-\end{column}% - -\hfill% - -\begin{column}{.48\textwidth} - -%\only<1-1>{\includegraphics[width=1.\textwidth]{images/bo_loop_overview/03_mean.png}} -%\only<2-6>{ -\includegraphics[width=\textwidth]{w06_hpo_bo/images/bo_loop_overview/Uncertainty.pdf} -%} - -\end{column}% -\end{columns} - -\end{frame} %----------------------------------------------------------------------- %----------------------------------------------------------------------- %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Overview} +\begin{frame}[c]{Overview of the Surrogate Models We'll Discuss} -\begin{columns}[T] % align columns -\begin{column}{.38\textwidth} -\begin{minipage}[c][.6\textheight][c]{\linewidth} +%\begin{columns}[T] % align columns +%\begin{column}{.38\textwidth} +%\begin{minipage}[c][.6\textheight][c]{\linewidth} \begin{itemize} \item Gaussian Processes \note[item]{(quite common)} \item Random Forests \note[item]{(our default choice)} \item Bayesian Neural Networks \note[item]{(recent trend)} \end{itemize} -\end{minipage} -\end{column}% - -\hfill% - -\begin{column}{.58\textwidth} - -\begin{columns}[T] % align columns -\begin{column}{.48\textwidth} - \includegraphics[width=1.\textwidth]{images/surrogate_models/uncertainty_gp.jpg} -\end{column}% - -\hfill% - -\begin{column}{.48\textwidth} - \includegraphics[width=1.\textwidth]{images/surrogate_models/uncertainty_forest.jpg} -\end{column}% -\end{columns} - -\vspace*{\fill} -\begin{center} - \includegraphics[width=.6\textwidth]{images/surrogate_models/uncertainty_dngo.jpg} -\end{center} -\vspace*{\fill} - -\end{column}% -\end{columns} +%\end{minipage} +%\end{column}% -\source{A. Klein: Introduction Automated Machine Learning} +%\hfill% +%\begin{column}{.58\textwidth} +% +%\begin{columns}[T] % align columns +%\begin{column}{.48\textwidth} +% \includegraphics[width=1.\textwidth]{images/surrogate_models/uncertainty_gp.jpg} +%\end{column}% +% +%\hfill% +% +%\begin{column}{.48\textwidth} +% \includegraphics[width=1.\textwidth]{images/surrogate_models/uncertainty_forest.jpg} +%\end{column}% +%\end{columns} +% +%\vspace*{\fill} +%\begin{center} +% \includegraphics[width=.6\textwidth]{images/surrogate_models/uncertainty_dngo.jpg} +% +%\end{center} +%\vspace*{\fill} +% +%\end{column}% +%\end{columns} +% +%\hspace{5.5cm}\footnotesize{Image source: \lit{\href{}{A. 
Klein: Introduction Automated Machine Learning}}} -%% TODO -\comment{adjust those plots -FH: best to have plots stay on the slide, e.g., next to each other.} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Gaussian Processes - Reminder} +\begin{frame}[c]{Gaussian Processes (GPs): Reminder of Pros and Cons} \begin{columns}[T] % align columns \begin{column}{.48\textwidth} @@ -101,10 +94,16 @@ \section{Surrogate Models} \begin{block}{Advantages} \begin{itemize} \item Smooth and reliable uncertainty estimates - \item Sample efficiency + \item Strong sample efficiency \item We can encode expert knowledge about the design space in the kernel \end{itemize} \end{block} +\bigskip +\pause +\hspace*{0.5cm}These advantages make GPs the\\ +\hspace*{0.5cm}\alert{most commonly-used model\\ +\hspace*{0.5cm}in Bayesian optimization} + \end{column}% \hfill% @@ -113,9 +112,10 @@ \section{Surrogate Models} \begin{column}{.48\textwidth} \begin{block}{Disadvantages} \begin{itemize} - \item We have to define a good kernel for each application + \item Performance can be quite sensitive to the choice of kernel \note[item]{(if we don't optimize small-dimensional, continuous functions)} \item Cost scales cubically with the number of observations + \item Weak performance for high dimensionality \note[item]{(because of inverting the kernel)} \item Not easily applicable in discrete or conditional spaces \item Sensitive to its own hyperparameters @@ -185,48 +185,50 @@ \section{Surrogate Models} %----------------------------------------------------------------------- %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Gaussian Processes - Kernel Hyperparameters} - -\begin{itemize} - \item We can use \emph{Maximum A Posteriori} (MAP) or \emph{Maximum Likelihood Estimation} (MLE) to optimize hyperparameters of the Gaussian process. - \item However, it is not realistic to assume that the hyperparameters distribution can be represented by a single point estimate -\end{itemize} - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Gaussian Processes - Kernel Hyperparameters} +\begin{frame}[c]{Gaussian Processes (GPs): Kernel Hyperparameters} \begin{columns}[T] % align columns \begin{column}{.6\textwidth} \begin{itemize} - \item<+-> \emph{Markov-Chain Monte-Carlo} (MCMC) samples hyperparameters from the posterior distribution - \item<+-> \emph{Marginalize} over hyperparameters and compute an \emph{integrated acquisition function}: + \item We could optimize GP hyperparameters (maximum likelihood, MLE, or maximum a posteriori, MAP) + + + \item<+-> But \alert{sampling} GP hyperparameters from the posterior distribution performs better; e.g., via \alert{Markov-Chain Monte-Carlo (MCMC)} + + + \item<+-> \alert{Marginalize} over GP hyperparameters $\theta$ and compute an \alert{integrated acquisition function}: \begin{equation*} \begin{aligned} \Bar{\acq}(\conf) = \int \acq (\conf, \surro_\theta)p(\theta)d\theta \end{aligned} \end{equation*} - \item<+-> But, MCMC is computationally more expensive since the acquisition function now needs to be calculated more than once. 
+ + + \item<+-> Downside: computational expense + \myit{ + \item MCMC is computationally expensive + \item Acquisition function now has to be calculated for each sample + } \end{itemize} \end{column} % \begin{column}{.4\textwidth} \only<2->{ -\begin{figure} \centering \includegraphics[width=0.7\textwidth]{images/surrogate_models/kernel_hp_mcmc.jpg} -\end{figure}} + + \footnotesize{Image source: \lit{\href{https://arxiv.org/pdf/1502.05700.pdf}{Snoek et al. 2015}}} +} + + + \end{column} \end{columns} -\source{Snoek et al. 2015} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Random Forests} +\begin{frame}[c]{Random Forests (RFs): Reminder \& How To Compute Uncertainties} \centering \includegraphics[width=0.5\textwidth]{images/surrogate_models/random_forest_pic} @@ -234,11 +236,11 @@ \section{Surrogate Models} \begin{columns}[T] % align columns \begin{column}{.48\textwidth} -\begin{block}{Train} +\begin{block}{RF Training} \begin{itemize} - \item $n$ randomized regression trees - \item Subsampled training data for each tree (with bootstrapping) - \item Each tree gives us a possible explanation for the observations + \item Fit a set of \alert{randomized} regression trees + \item Randomization via bootstrapping \& random selection of split variables / split points + \item Each tree yields a possible explanation for the observations \end{itemize} \end{block} \end{column} @@ -247,11 +249,11 @@ \section{Surrogate Models} \hfill \begin{column}{.48\textwidth} - \begin{block}{Predict} + \begin{block}{RF Prediction} \begin{itemize} - \item Obtain prediction of each tree + \item Predict with each tree \item Aggregate predictions (e.g., average) - \item Uncertainty of predictions: stdev across tree predictions + \item Uncertainty estimate:\\ \alert{empirical variance across tree predictions} \end{itemize} \end{block} \end{column} @@ -259,31 +261,51 @@ \section{Surrogate Models} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Random Forests - Hyperparameters} +\begin{frame}[c]{Random Forests (RFs): Impact of Basic Model Choices} +\vspace{-25pt} \begin{columns} - \column{0.5\textwidth} - \centering - w bootstrapping and\\ w/o random splits - \includegraphics[width=0.5\textwidth]{images/surrogate_models/rf_boot_middle_split.png} - - w bootstrapping and\\ w/ random splits - \includegraphics[width=0.5\textwidth]{images/surrogate_models/rf_boot_rand_split.png} - - \column{0.5\textwidth} - \centering - w/o bootstrapping and\\ w/o random splits - \includegraphics[width=0.5\textwidth]{images/surrogate_models/rf_noboot_middle_split.png} - - w/o bootstrapping and\\ w/ random splits - \includegraphics[width=0.5\textwidth]{images/surrogate_models/rf_noboot_rand_split.png} +\column{0.5\textwidth} + +\begin{figure}[h] +\captionsetup[subfigure]{position=top} +\centering + +\renewcommand{\thesubfigure}{a} +\subfloat[][no bootstrapping, \\ no random splits]{ +\includegraphics[width=0.55\textwidth, clip]{images/surrogate_models/rf_noboot_middle_split.png} + +} +\qquad +\renewcommand{\thesubfigure}{b} +\subfloat[][with bootstrapping, \\ no random splits]{ +\includegraphics[width=0.55\textwidth, clip]{images/surrogate_models/rf_boot_middle_split.png} +} +\end{figure} + +\column{0.5\textwidth} +\begin{figure}[h] +\captionsetup[subfigure]{position=top} +\centering + +\pause +\renewcommand{\thesubfigure}{c} +\subfloat[][no bootstrapping, \\ with random 
splits]{ +\includegraphics[width=0.55\textwidth, clip]{images/surrogate_models/rf_noboot_rand_split.png} + +} +\qquad +\renewcommand{\thesubfigure}{d} +\subfloat[][with bootstrapping, \\ with random splits]{ +\includegraphics[width=0.55\textwidth, clip]{images/surrogate_models/rf_boot_rand_split.png} +} + +\end{figure} \end{columns} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Random Forests - Summary} +\begin{frame}[c]{Random Forests (RFs): Overview of Pros and Cons} \begin{columns}[T] % align columns \begin{column}{.48\textwidth} @@ -291,10 +313,12 @@ \section{Surrogate Models} \begin{block}{Advantages} \begin{itemize} \item Cheap to train - \item Scales well with \#observations: + \item Scales well with \#observations $n$: \begin{itemize} - \item Worst-case complexity for $T$ tress with $n$ data points of dimensionality $p$: $\mathcal O(T\cdot p \cdot n^2 \log{n})$. + \item Fitting: $O(n \log n)$ + \item Prediction: $O(\log n)$ \end{itemize} + \item Scales well with \#dimensions \item Training can be parallelized \item Can easily handle conditional, categorical, continuous and discrete spaces \item Quite robust against its own hyperparameters @@ -314,28 +338,38 @@ \section{Surrogate Models} \end{itemize} \end{block} +\pause +\bigskip +\bigskip +\hspace*{0.5cm}These qualities make RFs a \alert{robust} \\ +\hspace*{0.5cm}\alert{option} for Bayesian optimization in \\ \hspace*{0.5cm}\alert{high dimensions}, for \alert{categorical spaces},\\ +\hspace*{0.5cm}or when function evaluations are quite fast + \end{column} \end{columns} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Bayesian Neural Networks} +\begin{frame}[c]{Bayesian Neural Networks: Overview} \begin{itemize} - \item Extend regression NNs to model uncertainty - \item Deal with all sources of parameter uncertainty: \pause + \item Neural networks are more flexible \& scalable than Gaussian processes + \item But for use in Bayesian optimization, neural networks need to be made probabilistic + +\pause + \item Bayesian deep learning aims to deal with all sources of uncertainty \pause \begin{itemize} - \item More than one weight vector can explain the observed data - \item Take into account all possible explanations + \item E.g., we don't have a single weight vector anymore, but a distribution over weights \end{itemize} \centering \includegraphics[width=0.6\textwidth]{images/surrogate_models/bnn.jpg} +\footnotesize{Image source: \lit{\href{http://proceedings.mlr.press/v37/blundell15.pdf}{Blundell et al. 2015}}} + \end{itemize} -\source{\href{http://proceedings.mlr.press/v37/blundell15.pdf}{Blundell et al. 
2015}} + \end{frame} %----------------------------------------------------------------------- @@ -353,54 +387,50 @@ \section{Surrogate Models} %\end{itemize} % %\end{frame} + %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Bayesian Neural Networks - Approaches} +\begin{frame}[c]{Simplest Way of Incorporating Uncertainty in Neural Networks: DNGO} + \begin{itemize} - \item Standard Bayesian optimization model Gaussian process scales badly to a large number of data points - \item Bayesian Neural Networks can be used as a more scalable model\pause - \bigskip - \item Overview of Bayesian Neural Networks for BO: - \begin{itemize} - \item \lit{\href{https://arxiv.org/pdf/1502.05700.pdf}{Snoek et al. 2015}} Scalable Bayesian Optimization Using Deep Neural Networks - \item \lit{\href{https://www.ismll.uni-hildesheim.de/pub/pdfs/schilling2015-ecml.pdf}{Schilling et al. 2015}} Hyperparameter Optimization with Factorized Multilayer Perceptrons - \item \lit{\href{https://papers.nips.cc/paper/6117-bayesian-optimization-with-robust-bayesian-neural-networks.pdf}{Springenberg et al. 2016}} Bayesian Optimization with Robust Bayesian Neural Networks - \item \lit{\href{https://arxiv.org/abs/1706.01825}{Hern\'andez-Lobato et al. 2017}} Parallel and Distributed Thompson Sampling for Large-scale Accelerated Exploration of Chemical Space - \item \lit{\href{https://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning}{Perrone et al. 2018}} Scalable Hyperparameter Transfer Learning + \item Fit a standard regression neural network to the data (with a linear output layer) + \item Use the representation in the last hidden layer as \alert{basis functions $\phi(x)$} of the input $x$ + \item \alert{Use Bayesian linear regression with these basis functions} \pause + \begin{itemize} + \item The last layer is linear in its parameters $\theta$ + \item Therefore, the Bayesian linear regression formulas work directly + \item Feasible in closed form, in time $O(N d^3)$, where $N$ is the number of data points\\ and $d$ is the number of hidden units in the last layer + \end{itemize} + \item Not fully Bayesian yet, but already allows scalable Bayesian optimization \lit{\href{https://arxiv.org/pdf/1502.05700.pdf}{Snoek et al. 2015}} - \end{itemize} \end{itemize} +%\vspace{1cm} +%\hspace{12cm} +%\lit{\href{https://arxiv.org/pdf/1502.05700.pdf}{Snoek et al. 
2015}} + \end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Bayesian Neural Networks - DNGO} -First approach: \href{https://arxiv.org/pdf/1502.05700.pdf}{Scalable Bayesian Optimization Using Deep Neural Networks} +%----------------------------------------------------------------------- + +\begin{frame}[c]{Bayesian Optimization with BNNs: Overview of Existing Approaches} \begin{itemize} - \item Fit a standard neural network to the data - \item Use the representation in the last hidden layer as basis \\ functions $\phi(x)$ of the input $x$ - \item Use Bayesian linear regression for the output layer \pause - \begin{itemize} - \item The last layer is linear in its parameters $\theta$ - \item Therefore, the Bayesian linear regression formulas work directly - \item Feasible in closed form, in time $O(N d^3)$, \\ where $N$ is the number of data points and $d$ is the number of \\ hidden units in the last layer - \end{itemize} - \item Not fully Bayesian yet -\end{itemize} + \item \lit{\href{https://arxiv.org/pdf/1502.05700.pdf}{Snoek et al. 2015}} Scalable Bayesian Optimization Using Deep Neural Networks (DNGO) + \item \lit{\href{https://papers.nips.cc/paper/6117-bayesian-optimization-with-robust-bayesian-neural-networks.pdf}{Springenberg et al. 2016}} Bayesian Optimization with Robust Bayesian Neural Networks + \item \lit{\href{https://arxiv.org/abs/1706.01825}{Hern\'andez-Lobato et al. 2017}} Parallel and Distributed Thompson Sampling for Large-scale Accelerated Exploration of Chemical Space +\bigskip +\pause + \item \lit{\href{https://www.ismll.uni-hildesheim.de/pub/pdfs/schilling2015-ecml.pdf}{Schilling et al. 2015}} Hyperparameter Optimization with Factorized Multilayer Perceptrons + \item \lit{\href{https://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning}{Perrone et al. 2018}} Scalable Hyperparameter Transfer Learning -\vspace{1cm} -\hspace{12cm} -\lit{\href{https://arxiv.org/pdf/1502.05700.pdf}{Snoek et al. 2015}} +\end{itemize} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Bayesian Neural Networks} +\begin{frame}[c]{Bayesian Neural Networks (BNNs): Overview of Pros and Cons} \begin{columns}[T] % align columns \begin{column}{.48\textwidth} @@ -408,56 +438,71 @@ \section{Surrogate Models} \begin{block}{Advantages} \begin{itemize} \item Scales linearly with \#observations - \item Given enough network samples obtain nice and smooth uncertainty estimates - \item Can handle categorical, continuous and discrete spaces + \item Can obtain nice and smooth uncertainty estimates + \item Flexibility: handling of categorical, continuous and discrete spaces \end{itemize} \end{block} + +\onslide<3->{ +\bigskip +\bigskip +\hspace*{0.5cm}These qualities make BNNs an \\ +\hspace*{0.5cm}\alert{ever-more promising alternative} +%\hspace*{0.5cm}Needed: robust auto-tuned(?) 
implementation +} \end{column}% \hfill% -\pause \begin{column}{.48\textwidth} +\onslide<2->{ \begin{block}{Disadvantages} \begin{itemize} - \item Poorer uncertainty estimates + \item Usually needs more data than Gaussian processes + \item Uncertainty estimates often worse than for Gaussian processes \item Many meta-design decisions \item No robust off-the-shelf implementation - \item Usually needs more data than Gaussian processes \end{itemize} \end{block} - +} \end{column} \end{columns} \end{frame} %----------------------------------------------------------------------- -\begin{frame}[c]{Surrogate Models} -\framesubtitle{Bayesian Neural Networks - Further Reading} +\begin{frame}[c]{Bayesian Neural Networks (BNNs): Further Reading} -There's a lot more which hasn't been applied to Bayesian optimization yet +There is a lot more work on BNNs that hasn't been applied to Bayesian optimization yet: \begin{itemize} - \item \lit{\href{https://www.cs.utoronto.ca/~radford/bnn.book.html}{Neal 1995}} Bayesian Learning for Neural Networks - \item \lit{\href{https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf}{Bishop 2006}} Pattern Recognition and Machine Learning - \item \lit{\href{https://papers.nips.cc/paper/7219-simple-and-scalable-predictive-uncertainty-estimation-using-deep-ensembles.pdf}{Lakshminarayanan et al. 2017}}: Simple and Scalable Predictive Uncertainty Estimation using Deep Ensembles - \item \lit{\href{https://arxiv.org/abs/1506.02142}{Gal et al. 2016}} Dropout as a Bayesian Approximation: +% \item \lit{\href{https://www.cs.utoronto.ca/~radford/bnn.book.html}{Neal 1995}} Bayesian Learning for Neural Networks +% \item \lit{\href{https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf}{Bishop 2006}} Pattern Recognition and Machine Learning + \item Ensembles obtained simply by running SGD several times \lit{\href{https://arxiv.org/abs/1612.01474}{Lakshminarayanan et al. 2017}} + %: Simple and Scalable Predictive Uncertainty Estimation using Deep Ensembles + \item Dropout \lit{\href{https://arxiv.org/abs/1506.02142}{Gal et al. 2016}} + %Dropout as a Bayesian Approximation: Representing Model Uncertainty in Deep Learning - \item \lit{\href{https://arxiv.org/abs/1802.06455}{Teye et al. 2018}} Bayesian Uncertainty Estimation for Batch Normalized Deep Networks - \item \lit{\href{https://arxiv.org/abs/1704.00109}{Gao Huang et al. 2017}} Snapshot Ensembles: Train 1, get M for free + \item Monte Carlo Batch Normalization \lit{\href{https://arxiv.org/abs/1802.06455}{Teye et al. 2018}} %Bayesian Uncertainty Estimation for Batch Normalized Deep Networks + \item Snapshot Ensembles \lit{\href{https://arxiv.org/abs/1704.00109}{Gao Huang et al. 2017}} + %Snapshot Ensembles: Train 1, get M for free \end{itemize} -or maybe doesn't work and hasn't been published + + \end{frame} %----------------------------------------------------------------------- \begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} \begin{itemize} %GP - \item \emph{Repetition.} What are the most important hyperparameters of a GP that you would want to optimize for Bayesian Optimization? +% \item \alert{Repetition.} What are the most important hyperparameters of a GP that you would want to optimize for Bayesian Optimization? %RF - \item \emph{Discussion.} For which optimization problems would you rather use a RF than a GP? 
- %BNN - \item \emph{Discussion.} Can a BNN be trained with standard MCMC in theory and in practice? + \item \alert{Discussion.} For which optimization problems would you rather use a RF than a GP? When would you use a BNN? +\medskip +%BNN +% \item \alert{Discussion.} Can a BNN be trained with standard MCMC in theory and in practice? %DNGO - \item \emph{Discussion.} Why can't you use a Bayesian Linear Regression for all layers in a Deep Neural Network? + \item \alert{Discussion.} Why can DNGO's Bayesian Linear Regression approach only be applied to the last layer of a Deep Neural Network, not to all layers? +\medskip + \item \alert{Open Research Project.} All of the surrogate models we saw have pros and cons. It would be interesting to select the best model (and its hyperparameters) dependent on the data at hand. + \end{itemize} -\end{frame} +\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t05_extensions.tex b/w06_hpo_bo/t05_extensions.tex deleted file mode 100644 index fb3ac54..0000000 --- a/w06_hpo_bo/t05_extensions.tex +++ /dev/null @@ -1,699 +0,0 @@ -%----------------------------------------------------------------------- -\section{Extensions to Bayesian Optimization} - -\begin{frame}{For what do we need extensions?} -\begin{block}{Standard BO Problem:} -\begin{itemize} - \item Sequential optimization - \pause - \item Continuous, smooth functions - \item Noise-free evaluations - \item No constraints -\end{itemize} -\end{block} -\begin{block}{\emph{Exotic} BO Problem:} -\begin{itemize} - \item Categorical hyperparameters - \item Disconnected search spaces - \item Parallel evaluations - \item Noisy evaluations - \item Optimization with constraints - \item Multi-objective Bayesian optimization -\end{itemize} -\end{block} -\end{frame} - -%\begin{frame}[c]{Categorical and Conditional Parameters} -%\framesubtitle{Introduction} -%\begin{itemize} -% \item<+->{Our parameter configuration space $\pcs$ can possibly contain: -% \begin{itemize} -% \item<+->{Neural Network Architectures.} -% \item<+->{Model-specific parameters.} -% \item<+->{General optimization parameters.} -% \end{itemize} -% } -% \item<+->{Consider searching through such a space of parameters. Is every individual dimension of this search space- -% \begin{itemize} -% \item<+->{Continuous?} -% \item<+->{Relevant?} -% \end{itemize} -% } -%\end{itemize} -%\end{frame} -%----------------------------------------------------------------------- -%\begin{frame}[c]{Categorical and Conditional Parameters} -%\framesubtitle{Categorical Parameters} -%\begin{itemize} -% \item<+-> Parameters that draw values from a discrete domain instead of a real-valued domain. -% \item<+-> Mathematically, a parameter $\hyperparam$ is a categorical parameter if $\hyperparam\in P$, where $P=\{p_1, p_2, \dots\}$ is a set of finite, discrete values. -% \item<+-> Examples: -% \begin{itemize} -% \item<+-> For training a neural network, we may choose one flavor of SGD out of $\{Vanilla, \,RMSProp, \,Adam\}$. -% \item<+-> For a layer in a Multi-Layer Perceptron, we may choose one activation function out of $\{tanh, \,sigmoid, \,relu, \,unit\}$. -% \end{itemize} -% \item<+-> Categorical parameters present a challenge: inferring gradients is not possible for unordered categories! -% \item<+-> Another challenge: Each individual category, or possible value of a categorical parameter, contributes to the curse of dimensionality in naive search approaches. 
-%\end{itemize} -%\end{frame} -%%----------------------------------------------------------------------- -%\begin{frame}[c]{Categorical and Conditional Parameters} -%\framesubtitle{Hamming Distance Kernel} -%\begin{center} -%Placeholder - Describe Hamming Distance Kernel from Frank's PhD thesis, include visualization -%\end{center} -%\end{frame} -%----------------------------------------------------------------------- -%\begin{frame}[c]{Categorical and Conditional Parameters} -%\framesubtitle{Conditional Parameters} -%\begin{itemize} -% \item<+-> Some parameters in the search space are only relevant in the context of specific values of other parameters. -% \item<+-> For example, if we are training a Neural Network using SGD, the momentum parameter is only relevant when using a flavour of SGD that supports it, such as Adam, -% \item<+-> Such parameters can be used to define conditional dependencies between parameters -% \item<+-> These dependencies define active/inactive sub-spaces within the search space -% \item<+-> Conditional parameters are most recognizable in the context of categorical parameters, but they need not be categorical -% \item<+-> Similar to categorical parameters, inferring gradients is not possible due to the presence of active/inactive sub-spaces -%\end{itemize} -%\end{frame} -%----------------------------------------------------------------------- -% \begin{frame}[c]{Categorical and Conditional Hyperparameters} -% \framesubtitle{Structured Search Spaces} -% \begin{itemize} -% \item<+-> In HPO, we have prior knowledge about when some hyperparameters in the search space are completely irrelevant -% \item<+-> Naively searching over the entire search space while disregarding any conditional dependencies is inefficient -% \item<+-> We can impose a structure over the search space with the help of conditional dependencies between the various parameters to speed-up and optimize the HPO task -% \end{itemize} -% \end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Categorical and Conditional Hyperparameters} -\framesubtitle{Structured Search Spaces} -\begin{center} - \includegraphics[width=.9\linewidth, height=0.9\textheight, keepaspectratio=true]{w06_hpo_bo/images/categ_cond_params/Conditional Parameters AutoML Book.png} - \newline - Example of a structured search space (Source: Figure 5.1 of the \lit{\href{https://www.automl.org/wp-content/uploads/2019/05/AutoML_Book.pdf}{AutoML book}}) -\end{center} -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Categorical and Conditional Hyperparameters} -\framesubtitle{Categorical Hyperparameters} -Categorical hyperparameters are discrete hyperparameters that \emph{can not be sorted} and the \emph{distances between values might vary}. This has to be taken into account by the surrogate model: -% -\pause -% -\begin{columns}[T] -\column{0.65\textwidth} -\begin{itemize} - \item <+-> \emph{One-hot} encoding provides a general solution - \item <+-> Random Forests \emph{natively} handle categorical inputs - \item <+-> Neural networks can learn \emph{entity embeddings} for categorical inputs \lit{\href{https://arxiv.org/pdf/1604.06737.pdf}{Guc et al. 
2016}} - \item <+-> Gaussian Processes can make use of the (weighted) \emph{Hamming Distance Kernel}: \lit{\href{https://www.cs.ubc.ca/~hutter/papers/Hutter09PhD.pdf}{Hutter 2009}} -\begin{equation*} - \kernel_{\theta}(\conf_i, \conf_j) = \exp{\sum_{l=1}^d (-\theta \cdot \delta(\hyperparam_{i,l} \neq \hyperparam_{j,l}))} -\end{equation*} -\end{itemize} -% -\column{0.35\textwidth} -\vspace{0.5cm} -\includegraphics[width=1\textwidth]{w06_hpo_bo/images/categ_cond_params/categorical.png} -% -\end{columns} -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Categorical and Conditional Hyperparameters} -\framesubtitle{Conditional Hyperparameters} - -A conditional hyperparameter is only relevant if another hyperparameter takes on a specific value and thus should be ignored by the model if not active: - -\begin{columns}[T] -\column{0.65\textwidth} - -\begin{itemize} - \item <+-> Setting the values for inactive hyperparameter to a specific value (e.g. $0$) - \item <+-> Random Forests \lit{\href{https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf}{Hutter et al. 2011}} and Tree Parzen Estimators \lit{\href{http://papers.nips.cc/paper/4443-algorithms-for-hyper}{Bergstra et al. 2011}} can natively handle conditional inputs - \item <+-> Furthermore, there exist several kernels for Gaussian Processes to handle conditional input \lit{\href{https://arxiv.org/abs/1310.5738}{Hutter et al. 2013}} \lit{\href{https://www.etsmtl.ca/Unites-de-recherche/LIVIA/Recherche-et-innovation/Publications/Publications-2017/Levesque_ijcnn_2017.pdf}{Lévesque et al. 2017}} \lit{\href{http://proceedings.mlr.press/v70/jenatton17a.html}{Jenatton et al. 2017}} -\end{itemize} -% -\column{0.35\textwidth} -\vspace{0.5cm} -\includegraphics[width=1\textwidth]{w06_hpo_bo/images/categ_cond_params/conditional.png} -% -\end{columns} -\pause -\vspace{0.5cm} -$\xrightarrow{}$ Searching in structured search spaces is still an active research topic and is far from being solved -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{Motivation} - -\begin{itemize} -% \item Bayesian Optimization success stories: -% \begin{itemize} -% \item robotics, planning, recommendation, automatic algorithm configuration etc. -% \end{itemize} -% \pause - \item Issue: BO works best on problems of moderate dimensions $d\leq20$ -% \pause - \begin{itemize} - \item Standard GPs do not tend to fit well in high dimensions - \item Maximizing the acquisition function is also computationally challenging - \end{itemize} -\medskip -\pause - - \item Possible solutions we will discuss: - \begin{itemize} - \item Embedding into a low-dimensional space (REMBO) \lit{\href{https://ml.informatik.uni-freiburg.de/papers/16-JAIR-REMBO.pdf}{Wang et al. 2016}} - \item Additive models \lit{\href{http://proceedings.mlr.press/v37/kandasamy15.pdf}{Kandasamy et al. 2015}} - \item Random Forests \lit{\href{https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf}{Hutter et al. 2011}} - \end{itemize} -\end{itemize} - - -\end{frame} - -%---------------------------------------------------------------------- - -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{The Curse of Dimensionality vs. 
Low Effective Dimensionality} -\begin{itemize} - \item Good coverage of $\pcs$ is required to ensure that the global optimum is found - \item The number of evaluations needed to cover $\pcs$ increases exponentially with dimensionality - \item Numerous optimization problems in practice have \emph{"low effective dimensionality"} - \begin{itemize} - \item E.g., HPO for neural networks and deep belief networks - \lit{\href{http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf}{Bergstra et al. 2012}} - \item E.g., algorithm configuration for combinatorial optimization solvers \lit{\href{http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf}{Hutter et al. 2014}} - \end{itemize} -\pause -\medskip - \item Idea: Exploit low effective dimensionality to cover a lower-dimensional space well -\end{itemize} - -\end{frame} - -%---------------------------------------------------------------------- -%---------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{Random Embeddings in a nutshell} - -Given a $D=2$ dimensional black-box function $\cost(x_{1},x_{2})$: -\begin{itemize} -\begin{columns}[T] -\begin{column}{0.45\linewidth} - - - \item Assume we know $\cost$ has only $d=1$ important dimensions, but we don't know which one it is. - \end{column} - \begin{column}{0.5\linewidth} - \begin{figure} - \includegraphics[width=0.5\textwidth]{images/highdim_images/Random embeddings in a nutshell1.png} - \end{figure} - \end{column} -\end{columns} - \pause - \begin{columns}[T] - \begin{column}{0.45\linewidth} - \vspace{-1em} - \item Subspace $x_1=x_2$ is guaranteed to include the optimum. - \end{column} - \begin{column}{0.5\linewidth} - \begin{figure} - \includegraphics[width=0.5\textwidth]{images/highdim_images/Random embeddings in a nutshell2.png} - \end{figure} - \end{column} -\end{columns} - \pause -\begin{columns} -\begin{column}{0.45\linewidth} - \vspace{-8em} - \item Idea applies to any $d$-dimensional linear subspace - \item Allows scaling to arbitrary $D$ (e.g., $D=1$ billion) -\end{column} -\begin{column}{0.5\linewidth} - -\end{column} -\end{columns} -\end{itemize} - - -\end{frame} - -%---------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{Random Embedding Bayesian Optimization (REMBO)} -\begin{columns}[T] -\begin{column}{0.5\textwidth} -\begin{itemize} - \item Generate a random matrix $A \in \realnum^{D \times d}$ - \item Choose a bounded region set $\obsspace\subset\realnum^d$ - \item Use BO to optimize $g(\conf)=\cost(\pmb{Ay})$ instead of high dimensional $\cost(\conf)$ -\end{itemize} -\end{column} -\begin{column}{0.5\textwidth} -\begin{figure} - \includegraphics[width=0.8\textwidth]{images/highdim_images/Embedding.png} -\end{figure} -\end{column} - -\end{columns} -\end{frame} - -%---------------------------------------------------------------------- - -%---------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{REMBO- Pseudocode} - - -\begin{algorithm}[H] - \SetAlgoLined - \setcounter{AlgoLine}{0} - \SetKwInOut{Require}{Require} - \SetKwInOut{Result}{Result} - - \Require{Search space $\pcs$, cost function $\cost$, acquisition function $\acq$, predictive model $\surro$, maximal number of function evaluations $\bobudget$} - \Result{Best observed configuration $\finconf$ according to $\iter[\bobudget]{\dataset}$ or $\surro$} - - 
\textcolor{blue}{Generate a random matrix $\pmb{A} \in \realnum^{D\times d}$}\; - - \textcolor{blue}{Choose the bounded region set $\obsspace\subset\realnum^d$}\; - - $\iter[0]{\dataset} \leftarrow \varnothing$\; - - \For{$\bocount=1$ \KwTo $\bobudget$}{ - $\iter[\bocount]{\surro} \leftarrow$ fit predictive model on $\iter[\bocount-1]{\dataset}$\; - - \textcolor{blue}{$\pmb{y} \leftarrow \pmb{y} \in \argmax_{\pmb{y}\in\obsspace} \acq(\pmb{y}|\iter[\bocount-1]{\dataset}, \iter[\bocount]{\surro})$}\; - - Query $\cost(\textcolor{blue}{\pmb{A}\pmb{y}})$; - - $\iter[\bocount]{\dataset} \leftarrow \iter[\bocount-1]{\dataset} \cup \{\langle \textcolor{blue}{\pmb{A}\pmb{y}}, \cost(\textcolor{blue}{\pmb{A}\pmb{y}}) \rangle \}$\; - } - \caption{REMBO: Bayesian Optimization with Random Embedding.} -\end{algorithm} -\end{frame} -%---------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{Random Embedding Bayesian Optimization - Summary} -\begin{columns}[T] % align columns -\begin{column}{.48\textwidth} - - - \begin{block}{Advantages} - \begin{itemize} - \item Exploits low effective dimensionality - \item Allows scaling to very high dimensions - \item Applies to both continuous and categorical variables - \item Trivial modification of BO algorithm - \item Coordinate independent (invariant under rotations) - \end{itemize} - \end{block} -\pause -\end{column}% - -\hfill% - -\begin{column}{.48\textwidth} - - \begin{block}{Disadvantages} - \begin{itemize} - \item Sensitive to the definition of the bounded low dimensional constrained space $\obsspace$ - \item Assumes truly unimportant dimensions - %Limits a high dimensional function in a low dimensional embedding - %\item Sensitive to the effective dimension - \end{itemize} -\end{block} - -\end{column} -\end{columns} -\end{frame} - -%---------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{High Dimensional Bayesian Optimisation and Bandits via Additive Models} - -\vspace{2em} -\begin{itemize} - \item Recall: - - \begin{itemize} - \item Standard GPs do not tend to fit well in high dimensions - \item Maximizing the acquisition function is also computationally challenging - \pause - \end{itemize} -\medskip - \item Idea: - \begin{itemize} - \item Assume additive structure of the objective function - \begin{equation*} - f(\conf)=f^{(1)}(\conf^{(1)})+f^{(2)}(\conf^{(2)})+...+f^{(M)}(\conf^{(M)}) - \end{equation*} - \item Use an alternative acquisition function which applies to an additive kernel - \end{itemize} -\end{itemize} - -\end{frame} - -%---------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{Additive GP Models in a nutshell} - \begin{equation*} - f(\conf)=f^{(1)}(\conf^{(1)})+f^{(2)}(\conf^{(2)})+...+f^{(M)}(\conf^{(M)}) - \end{equation*} -\begin{itemize} -\begin{columns}[T] -\begin{column}{0.45\linewidth} - -\hspace{2em} - \item Key assumption: $f$ decomposes into lower-dimensional additive components $\conf^{(j)} \in \pcs^{(j)}, j \in \{1,..,M\}$ - \item The decompositions are disjoint $\conf^{(i)} \cap \conf^{(j)} = \varnothing$ - \pause - \item Each decomposition $f^{(j)}(\conf^{(j)})$ is modelled by an individual GP - \pause - \end{column} - \begin{column}{0.45\linewidth} - \begin{figure} - \includegraphics[width=0.7\textwidth]{images/highdim_images/additive-models.png} - 
\caption{Decomposition in two additive components (M=2) \\ Source: \lit{\href{http://proceedings.mlr.press/v37/kandasamy15.pdf}{Kandasamy et al. 2015}}} - \end{figure} - \end{column} -\end{columns} -\end{itemize} -\end{frame} -%---------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{Additive GP-UCB} -\begin{itemize} - - \item Idea: Represent acquisition function as sum of functions on decompositions: - \begin{equation*} - \acq_{t}(\conf) = \sum_{j}\acq_{t}^{(j)}(\conf^{(j)}) - \end{equation*} -\pause -\medskip - \item $\acq_{t}$ is maximized by maximizing each $\acq_{t}^{(j)}$ separately: - \begin{equation*} - \hat{\varphi}_{t}^{(j)}(\conf^{(j)}) = \mean_{t-1}^{(j)}(\conf^{(j)}) + \beta_{t}^{1/2}\stddev_{t-1}^{(j)}(\conf^{(j)}) - \end{equation*} - \item Authors have used UCB for this work, but other acquisition functions are possible, too. -\end{itemize} -\source{\href{http://proceedings.mlr.press/v37/kandasamy15.pdf}{Kandasamy et al. 2015}} -\end{frame} -%---------------------------------------------------------------------- -% \begin{frame}[c]{High Dimensional Bayesian Optimization} -% \framesubtitle{Additive GP-UCB- Pseudocode} -% \begin{algorithm}[H] -% %\DontPrintSemicolon -% \LinesNumbered -% \SetAlgoLined -% \setcounter{AlgoLine}{0} -% \SetKwInOut{Input}{Input} - -% %\Input{Kernels $\kernel^{(1)},...,\kernel^{(M)}$, Decomposition $(\pcs^{(j)})_{j=1}^{M}$}\\ -% \Input{ Kernels $\kernel^{(1)},...,\kernel^{(M)}$, Decomposition $(\pcs^{(j)})_{j=1}^{M}$ -% $\dataset_{0}\leftarrow\varnothing$} -% \For{$j=1,...,M$, $(\mean_0^{(j)},\kernel_0^{(j)})\leftarrow(0,\kernel^{(j)})$.}{ -% \For{$j=1,...,M$,}{ -% $\confI{t}_{(j)}\leftarrow\argmax_{z\in\pcs^{(j)}}\mean_{t-1}^{(j)}(z) +\sqrt{\beta_{t}}\stddev_{t-1}^{(j)}(z)$;\ - -% $\confI{t}\leftarrow\bigcup_{j=1}^{M} \confI{t}_{(j)}$;\ - -% $\obs\leftarrow$ Query $\cost$ at $\confI{t}$;\ - -% $\dataset_{t}=\dataset_{t-1}\cup\{(\confI{t},\obs)\}$;\ - -% Perform Bayesian Optimization posterior updates conditioned on $\dataset_{t}$ to obtain $\mean_{t}^{(j)},\stddev_{t}^{(j)}$ for $j=1,...,M$;\ -% } -% } -% \caption{Add-GP-UCB} -% \end{algorithm} -% \end{frame} - - -%---------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{High dimensional BO via additive models - Summary} -\begin{columns}[T] % align columns -\begin{column}{.48\textwidth} - - - \begin{block}{Advantages} - \begin{itemize} - \item Exploits low effective dimensionality - \item Scales GP-UCB to high dimensional parameter spaces - \item Regret is linearly dependent on the dimension D when $\cost$ is additive - \item Add-GP-UCB applies to an additive kernel - \end{itemize} - \end{block} -\pause -\end{column}% - -\hfill% - -\begin{column}{.48\textwidth} - - \begin{block}{Disadvantages} - \begin{itemize} - \item Relies on structural assumptions about the objective function - \item Restricted to an axis aligned representation - \item Sensitive on the number of additive components - \end{itemize} -\end{block} - -\end{column} -\end{columns} -\end{frame} - -%--------------------------------------------------------------------- -\begin{frame}[c]{High Dimensional Bayesian Optimization} -\framesubtitle{Random Forests} -BO with random forests as surrogates has been shown to perform well in high dimensions: -\begin{itemize} - \item Can handle complex parameter spaces: - \begin{itemize} - \item High dimensionality (low effective 
dimensionality) - \item Mixed continuous/discrete parameters - \item Conditional parameters - \end{itemize} -\pause -\smallskip - \item Can handle non-standard noise - \begin{itemize} - \item Non-Gaussian noise - \item Heteroscedastic noise - \end{itemize} -\pause -\smallskip - \item Effective model for off-the-shelf Bayesian optimization - \begin{itemize} - \item Robustness of the model - \item Model overhead - \end{itemize} -\pause -\smallskip - \item SMAC is a Bayesian optimization tool that uses random forests as a surrogate model -\end{itemize} -\end{frame} -%---------------------------------------------------------------------- -\begin{frame}[c]{Parallel Bayesian Optimization} -\framesubtitle{Multi-point Acquisition Functions} - -Often, there are many parallel compute resources available, but Bayesian optimization works sequentially. Extending Bayesian optimization to a parallel setting is \emph{not trivial}. -\pause -\begin{itemize} - \item <+-> To select a batch of points in parallel we need to compute the multi-point acquisition function, e.g. Expected Improvement: - \begin{equation*} - q\text{-EI}(\conf_{1, \dots, q}) = \E \left[ \cost(\incumbent) - \min_{i=1, \dots, q} \surro(\conf_i) \right] - \end{equation*} - \item <+-> For EI and KG, this requires \emph{expensive-to-compute} q-dimensional Gaussian cumulative distributions and is thus also \emph{expensive to maximize}. \lit{\href{https://hal.archives-ouvertes.fr/hal-00260579/document}{Ginsbourger et al. 2007}}, \lit{\href{https://arxiv.org/pdf/1606.04414v4.pdf}{Wu et al. 2018}}, \lit{\href{https://arxiv.org/abs/1805.10196}{Wilson et al. 2018}}, \lit{\href{https://arxiv.org/pdf/1602.05149.pdf}{Wang et al. 2019}} -\end{itemize} -\pause -\end{frame} -%---------------------------------------------------------------------- -\begin{frame}[c]{Parallel Bayesian Optimization} -\framesubtitle{Constant Liar, Kriging Believer and Fantasies} -Assume we do not want to select a batch of points, but to choose the next $\conf$ while there are still \emph{pending evaluations}. -\pause -% -A conceptually simple method is to assume observations for the outstanding evaluations and perform sequential optimization: -\pause -\begin{itemize} - \item <+-> \emph{Constant Liar}: Choose a fixed value (constant) \lit{\href{http://www.cs.ubc.ca/labs/beta/EARG/stack/2010_CI_Ginsbourger-ParallelKriging.pdf}{Ginsbourger et al. 2010}} - \item <+-> \emph{Kriging Believer}: Use the current mean prediction (belief) \lit{\href{http://www.cs.ubc.ca/labs/beta/EARG/stack/2010_CI_Ginsbourger-ParallelKriging.pdf}{Ginsbourger et al. 2010}} - \item <+-> \emph{Fantasies}: Use Monte Carlo estimates (fantasies, more details on the next slide). -\end{itemize} -\end{frame} -%---------------------------------------------------------------------- -\begin{frame}[c]{Parallel Bayesian Optimization} -\framesubtitle{Constant Liar, Kriging Believer and Fantasies} -% -Assume we have observed data $\left\{ \left\langle \bonextsample, \bonextobs \right\rangle \right\}^{N}_{\bocount = 1}$ and $J$ evaluations are pending $\left \{\conf_{j} \right \}^{J}_{j = 1}$. 
We can compute the expected mean function using the following integral: \pause -\begin{equation*} -\begin{aligned} - \Bar{\acq} \left( \conf; \left\{ \left\langle \bonextsample, \bonextobs \right\rangle \right\}^{N}_{\bocount = 1}, \left \{\conf_{j} \right \}^{J}_{j = 1} \right) = \pause - \int_{\mathbb{R}^J} \acq \left( \conf; \left\{ \left\langle \bonextsample, \bonextobs \right\rangle \right\}^{N}_{\bocount = 1}, \left \{ \left\langle \conf_j, \obs_j \right\rangle \right \}^{J}_{j=1} \right) \\ - p( \{ \obs_j \}^{J}_{j = 1} \rvert \{ \conf_j \}^{J}_{j = 1}, \left \left\{ \left\langle \bonextsample, \bonextobs \right\rangle \right\}^{N}_{\bocount = 1} )d\obs_1 \dots d\obs_J -\end{aligned} -\end{equation*} -% -\vspace{-0.8cm} -\begin{columns} -\column{0.6\textwidth} -\only<4->{ -\begin{enumerate} - \only<4->{\item Evaluated observations: $\left \{\conf_1, \conf_3, \conf_4 \right \}$, pending: $\left \{\conf_2, \conf_5 \right \}$.} - \only<5->{\item Fit a model for each possible realization of $\left \{\cost(\conf_2), \cost(\conf_5) \right \}$.} - \only<6->{\item Calculate acquisition function for each model.} - \only<7->{\item Integrate all acquisition functions over $\conf$.} -\end{enumerate} -} - -\column{0.4\textwidth} - \only<4-5>{ - \begin{figure} - \centering - \includegraphics[width=0.8\textwidth]{w06_hpo_bo/images/parallel/parallel_a.jpg} - \end{figure} - } - \only<6>{ - \begin{figure} - \centering - \includegraphics[width=0.8\textwidth]{w06_hpo_bo/images/parallel/parallel_b.jpg} - \end{figure} - } - \only<7->{ - \begin{figure} - \centering - \includegraphics[width=0.8\textwidth]{w06_hpo_bo/images/parallel/parallel_c.jpg} - \end{figure} - } -\end{columns} - -\source{\href{https://papers.nips.cc/paper/4522-practical-bayesian-optimization-of-machine-learning-algorithms.pdf}{Snoek et al. 2012}} - -\end{frame} -% -%\begin{frame} -%\begin{itemize} -% \item <+-> Utilize tractable properties of GP to get Monte Carlo estimates of %acquisition function under different results from pending function evaluations. \pause -% \item <+-> Consider the case where $N$ evaluations have completed, with data $\left \{\bonextsample, \bonextobs \right \}^{N}_{\bocount = 1}$ and $J$ evaluations are pending $\left \{\conf_{j} \right \}^{J}_{j = 1}$: \pause -% \begin{equation*} -% \begin{aligned} -% \hat{\acq} ( \conf; \left \{ \bonextsample, \bonextobs \right \}, \left \{ \conf_j \right \} ) = \pause -% \int_{\mathbb{R}^J} \pause \acq ( \conf; \left \{ \bonextsample, %\bonextobs \right \}, \left \{ \conf_j, \obs_j \right \} ) \\ \pause -% p(\left \{ \obs_j \right \}^{J}_{j = 1} \rvert \left \{ \conf_j \right \}^{J}_{j = 1}, \left \{ \bonextsample, \bonextobs \right \}^{N}_{\bocount=1} )d\obs_1 \dots d\obs_J -% \end{aligned} -% \end{equation*} -%\end{itemize} - -%\source{\href{https://csc2541-f17.github.io/}{Scalable and Flexible Models of Uncertainty, University of Toronto}} - -%\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Further extensions} -\framesubtitle{Noisy Evaluations} - -\small - -\begin{columns}[T] - - \column{0.6\textwidth} - - \begin{itemize} - \item Given noisy evaluations, GP regression proceeds similarly to the noiseless case by adding the variance to the diagonal of the covariance matrix. - \vspace{-0.2cm} - \item KG and ES directly handle noisy function evaluations~\lit{\href{https://arxiv.org/abs/1807.02811}{Frazier 2018}}. 
- \vspace{-0.2cm} - \item Computing EI with observation noise is challenging: - \vspace{-0.2cm} - \begin{itemize} - \item Uncertainty about which point is the current incumbent $\incumbent[\bocount-1]$. - \item Uncertainty about the costs $\cost(\conf_i)$. - \end{itemize} - \end{itemize} - - \column{0.4\textwidth} - \includegraphics[width=\textwidth]{images/extensions/BO_Loop_Noisy3.png} - -\end{columns} -\begin{itemize} - \item Noisy Expected Improvement~\lit{\href{https://arxiv.org/abs/1706.07094}{Letham et al. 2019}} extends the regular Expected Improvement by integrating over the predictive posterior of the model: - % There's an additional paper by Gramacy and Lee from 2011 which does a more complicated treatment of noise in EI - \begin{equation*} - \acq_{NEI}(\conf|\dataset)=\int_{\surro}\acq_{EI}(\conf|\surro)p(\surro|\dataset)\text{d}\surro - \end{equation*} - \vspace{-0.2cm} - \begin{itemize} - \item Compute with Monte Carlo Integration. - \item Each sample from the model posterior has its own incumbent $\incumbent[\bocount-1]$. - \end{itemize} - \end{itemize} -\end{frame} - -%---------------------------------------------------------------------- - -\begin{frame}[c]{Further extensions} -\framesubtitle{Constrained Bayesian Optimization} - -\begin{columns}[T] - -\column{0.6\textwidth} - -Four types of constraints -\begin{small} -\begin{enumerate} - \item Known constraints: can be accounted for when optimizing $\acq$ - %\item Policy constraints: function value is observed, but deemed forbidden~\lit{\href{https://www.soe.ucsc.edu/sites/default/files/technical-reports/UCSC-SOE-10-10.pdf}{Lee and Gramacy 2010}} - \item Hidden constraints: no function value is observed due to a failed function evaluation~\lit{\href{https://www.soe.ucsc.edu/sites/default/files/technical-reports/UCSC-SOE-10-10.pdf}{Lee and Gramacy 2010}} - \item Unknown constraints: there's an additional, but unknown constraint function, for example the memory used, which can be observed and modeled -\end{enumerate} -\end{small} - -\column{0.4\textwidth} -\includegraphics[width=0.9\textwidth]{images/extensions/notebooks_constrained_bo_4_0.png}\\ - \footnotesize{Hidden constraints. Image source: \lit{\href{https://gpflowopt.readthedocs.io/en/latest/notebooks/constrained_bo.html}{GPFlowOpt Tutorial, Apache 2 License}}} - -\end{columns} - -Most general solution: \emph{Expected Constrained Improvement}~\lit{\href{https://www.soe.ucsc.edu/sites/default/files/technical-reports/UCSC-SOE-10-10.pdf}{Lee and Gramacy 2010}}: -\vspace{-0.1cm} -\begin{equation} - ECI(\conf) = EI(\conf)h(\conf), -\end{equation} -\vspace{-0.1cm} -where $h(\conf)$ is the probability that $\conf$ is a valid configuration. - -\vspace{0.1cm} -Further literature in \lit{\href{https://arxiv.org/abs/1807.02811}{Frazier 2018}} and \lit{\href{https://link.springer.com/chapter/10.1007/978-3-030-05318-5_1}{Feurer and Hutter 2019}}. - -\end{frame} -%---------------------------------------------------------------------- -\begin{frame}[c]{Further extensions} -\framesubtitle{Even more extensions} -Bayesian optimization has been extended to numerous scenarios: -\begin{itemize} - \item Multi-task, Multi-fidelity and Meta-learning $\rightarrow$ separate lecture - \item Multi-objective Bayesian optimization $\rightarrow$ separate lecture - \item Bayesian optimization with safety guarantees~\lit{\href{http://proceedings.mlr.press/v37/sui15.pdf}{Sui et al. 
2015}} - \item Directly optimizing for ensemble performance~\lit{\href{http://auai.org/uai2016/proceedings/papers/73.pdf}{Lévesque et al. 2016}} - \item Combination with local search methods~\lit{\href{https://www.researchgate.net/publication/241216681_Bayesian_Guided_Pattern_Search_for_Robust_Local_Optimization}{Taddy et al. 2009}} - \item Can optimize anything that can be described by a kernel, such as neural network architectures~\lit{\href{https://papers.nips.cc/paper/7472-neural-architecture-search-with-bayesian-optimisation-and-optimal-transport.pdf}{Kandasamy et al. 2018}} or a latent embedding, such as molecules~\lit{\href{https://arxiv.org/abs/1709.05501}{Griffiths et al. 2017}} - \item Many more, too many to mention -\end{itemize} - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} - -\begin{itemize} -% Categorical and Conditional -\item \emph{Discussion.} What would happen if you treat a categorical hyperparameter as a continuous hyperparameter, e.g. ${0, 0.5, 1}$ instead of ${A, B, C}$, in Bayesian optimization using a Gaussian Process? -\item \emph{Repetition.} What is the main idea behind additive modelling? Is this assumption realistic? -% Parallel -\item \emph{Repetition.} Which method can you use to impute values for outstanding evaluations. What are advantages and disadvantages of each method? -% Noise & Constrained -\item \emph{Discussion.} What are worst case scenarios that could happen if you ignore the noise during Bayesian optimization? -% Further -\end{itemize} -\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t05_high_dim_BO.pdf b/w06_hpo_bo/t05_high_dim_BO.pdf new file mode 100644 index 0000000..ce1ed2a Binary files /dev/null and b/w06_hpo_bo/t05_high_dim_BO.pdf differ diff --git a/w06_hpo_bo/t05_high_dim_BO.tex b/w06_hpo_bo/t05_high_dim_BO.tex new file mode 100644 index 0000000..cc4bab2 --- /dev/null +++ b/w06_hpo_bo/t05_high_dim_BO.tex @@ -0,0 +1,387 @@ +\videotitle{High-Dimensional Bayesian Optimization} + +%----------------------------------------------------------------------- +\begin{frame}[c]{High-Dimensional Bayesian Optimization: Motivation} + +\begin{itemize} +% \item Bayesian Optimization success stories: +% \begin{itemize} +% \item robotics, planning, recommendation, automatic algorithm configuration etc. +% \end{itemize} +% \pause + \item Issue: Standard BO works best on problems of moderate dimensions $d\leq20$ +% \pause + \begin{itemize} + \item Standard Gaussian processes do not tend to fit well in high dimensions + \item Maximizing the acquisition function is also computationally challenging + \end{itemize} +\medskip +\pause + + \item Possible solutions we will discuss: + \begin{itemize} + \item Different models, in particular random forests \lit{\href{https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf}{Hutter et al. 2011}} + \item Embedding into a low-dimensional space (REMBO) \lit{\href{https://ml.informatik.uni-freiburg.de/papers/16-JAIR-REMBO.pdf}{Wang et al. 2016}} + \item Additive models \lit{\href{http://proceedings.mlr.press/v37/kandasamy15.pdf}{Kandasamy et al. 
2015}} + \end{itemize} +\end{itemize} + + +\end{frame} + + +%---------------------------------------------------------------------- + +\begin{frame}[c]{Low Effective Dimensionality} + +\begin{itemize} +% \item Good coverage of $\pcs$ is required to ensure that the global optimum is found +% \item The number of evaluations needed to cover $\pcs$ increases exponentially with dimensionality + \item Many optimization problems in practice have \alert{low effective dimensionality} + \begin{itemize} + \item E.g., HPO for deep neural networks + \lit{\href{http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf}{Bergstra et al. 2012}} + \item E.g., algorithm configuration for combinatorial optimization solvers \lit{\href{http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf}{Hutter et al. 2014}} + \end{itemize} +\pause +\medskip + \item \alert{Idea: Exploit low effective dimensionality to cover a lower-dimensional space well} +\end{itemize} + +\end{frame} + +%---------------------------------------------------------------------- + +%--------------------------------------------------------------------- +\begin{frame}[c]{Using Random Forests for High Dimensions / Low Effective Dimensionality} + + \myit{ + \item Random forests are \alert{automatic feature detectors} + \myit{ + \item They automatically select the important (axis-aligned) inputs + } + \medskip + \item Random forests have indeed been used effectively on spaces of more than 700 hyperparameters + \myit{ + \item In terms of computational efficiency, they do not pose a bottleneck + \item In terms of statistical efficiency, they scale more gracefully to high dimensions than GPs + } + } +\end{frame} +%---------------------------------------------------------------------- + +%---------------------------------------------------------------------- +\begin{frame}[c]{Random Embeddings for Exploiting Low Effective Dimensionality: Overview} + +Given a $D=2$ dimensional black-box function $\cost(x_{1},x_{2})$: +\begin{itemize} +\begin{columns}[T] +\begin{column}{0.45\linewidth} + + + \item Assume we know $\cost$ has only $d=1$ important dimension, but we don't know which one it is. + \end{column} + \begin{column}{0.5\linewidth} + \begin{figure} + \includegraphics[width=0.5\textwidth]{images/highdim_images/Random embeddings in a nutshell1.png} + \end{figure} + \end{column} +\end{columns} + \pause + \begin{columns}[T] + \begin{column}{0.45\linewidth} + \vspace{-1em} + \item Subspace $x_1=x_2$ is guaranteed to include the optimum.
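As an illustrative aside (not part of the slide sources in this diff): the random-embedding trick described on this slide can be made concrete in a few lines of NumPy. The toy objective, the random matrix, and the grid search that stands in for the inner Bayesian optimization loop are all assumptions for illustration, not code from the repository.

```python
import numpy as np

rng = np.random.default_rng(0)

D, d = 2, 1  # extrinsic dimensionality D, effective dimensionality d

def cost(x):
    """Toy objective with low effective dimensionality: only x[0] matters."""
    return (x[0] - 0.5) ** 2

# Random embedding matrix A in R^{D x d}
A = rng.normal(size=(D, d))

def g(y):
    """Objective seen through the embedding: g(y) = cost(A @ y)."""
    return cost(A @ y)

# A dense 1-D grid search stands in for Bayesian optimization over y
ys = np.linspace(-5.0, 5.0, 2001).reshape(-1, 1)
values = np.array([g(y) for y in ys])
y_best = ys[values.argmin()]

print("best y:", y_best, "-> x = A @ y =", (A @ y_best).ravel(), "cost:", g(y_best))
```

Because only $x_1$ matters in this toy objective, sweeping the single embedded coordinate $y$ moves $x_1$ through all values, so the one-dimensional search already reaches (approximately) the global optimum; this is the property that the REMBO theorem on the following slide formalizes.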
+ \end{column} + \begin{column}{0.5\linewidth} + \begin{figure} + \includegraphics[width=0.5\textwidth]{images/highdim_images/Random embeddings in a nutshell2.png} + \end{figure} + \end{column} +\end{columns} + \pause +\begin{columns} +\begin{column}{0.45\linewidth} + \vspace{-8em} + \item This idea applies to any $d$-dimensional linear subspace; allows scaling to arbitrary $D$ (e.g., $D=1$ billion) +\end{column} +\begin{column}{0.5\linewidth} + +\end{column} +\end{columns} +\end{itemize} + + +\end{frame} + +%---------------------------------------------------------------------- +\begin{frame}[c]{Random Embedding Bayesian Optimization (REMBO)} +\begin{columns}[T] +\begin{column}{0.01\textwidth} +\end{column} +\begin{column}{0.5\textwidth} +\begin{itemize} + \item Generate a \alert{random matrix $A \in \realnum^{D \times d}$} +% \item Choose a bounded region set $\obsspace\subset\realnum^d$ + \item \alert{Use BO to optimize $g(\conf)=\cost(\pmb{Ay})$} instead of high dimensional $\cost(\conf)$ +\end{itemize} + +\onslide<2->{ +\bigskip +\myblock{Theorem}{ +If the effective dimensionality of $c$ is at most d, then with probability 1, for any $\conf\in \realnum^{D}$, there exists a $\pmb{y}\in\realnum^d$ such that $c(\conf) = c(\pmb{Ay})$.} +} +\end{column} +\onslide<1->{ +\begin{column}{0.49\textwidth} +\begin{figure} +\includegraphics[width=0.8\textwidth]{images/highdim_images/Embedding.png} +\end{figure} +\end{column} +} + +\end{columns} +\end{frame} + +%---------------------------------------------------------------------- + +%---------------------------------------------------------------------- +\begin{frame}[c]{High Dimensional Bayesian Optimization: REMBO Pseudocode} + +\begin{center} +\begin{minipage}{0.85\textwidth} + +\begin{algorithm}[H] +% \SetAlgoLined + \setcounter{AlgoLine}{0} + \SetKwInOut{Require}{Require} + \SetKwInOut{Result}{Result} + + \Require{Search space $\pcs$, cost function $\cost$, acquisition function $\acq$, predictive model $\surro$, maximal number of function evaluations $\bobudget$} + \Result{Best observed configuration $\finconf$ according to $\iter[\bobudget]{\dataset}$ or $\surro$} + + \textcolor{blue}{Generate a random matrix $\pmb{A} \in \realnum^{D\times d}$}\; + +% \textcolor{blue}{Choose the bounded region set $\obsspace\subset\realnum^d$}\; + + $\iter[0]{\dataset} \leftarrow \varnothing$\; + + \For{$\bocount=1$ \KwTo $\bobudget$}{ + $\iter[\bocount]{\surro} \leftarrow$ fit predictive model on $\iter[\bocount-1]{\dataset}$\; + + \textcolor{blue}{$\pmb{y} \leftarrow \pmb{y} \in \argmax_{\pmb{y}\in\obsspace} \acq(\pmb{y}|\iter[\bocount-1]{\dataset}, \iter[\bocount]{\surro})$}\; + + Query $\cost(\textcolor{blue}{\pmb{A}\pmb{y}})$; + + $\iter[\bocount]{\dataset} \leftarrow \iter[\bocount-1]{\dataset} \cup \{\langle \textcolor{blue}{\pmb{A}\pmb{y}}, \cost(\textcolor{blue}{\pmb{A}\pmb{y}}) \rangle \}$\; + } + \caption*{REMBO: Bayesian Optimization with Random Embedding} +\end{algorithm} + +\end{minipage} +\end{center} + +\end{frame} +%---------------------------------------------------------------------- +\begin{frame}[c]{High Dimensional Bayesian Optimization} +\framesubtitle{Random Embedding Bayesian Optimization - Summary} +\begin{columns}[T] % align columns +\begin{column}{.48\textwidth} + + + \begin{block}{Advantages} + \begin{itemize} + \item Exploits low effective dimensionality + \item Allows scaling to arbitrarily high extrinsic dimensions + \item Applies to both continuous and categorical variables + \item Trivial modification of BO algorithm + 
\item Coordinate independent (invariant under rotations) + \end{itemize} + \end{block} +\pause +\end{column}% + +\hfill% + +\begin{column}{.48\textwidth} + + \begin{block}{Disadvantages} + \begin{itemize} + \item Sensitive to the definition of the bounded low dimensional constrained space $\obsspace$ + \item Assumes truly unimportant dimensions + %Limits a high dimensional function in a low dimensional embedding + %\item Sensitive to the effective dimension + \end{itemize} +\end{block} + +\end{column} +\end{columns} +\end{frame} + +%---------------------------------------------------------------------- +\begin{frame}[c]{High Dimensional Bayesian Optimization via Additive Models} + +\medskip +\begin{itemize} + \item Recall: + + \begin{itemize} + \item Standard GPs do not tend to fit well in high dimensions + \item Maximizing the acquisition function is also computationally challenging + \end{itemize} +\medskip +\pause + \item Idea: + \begin{itemize} + \item Assume additive structure of the objective function \lit{\href{http://proceedings.mlr.press/v37/kandasamy15.pdf}{Kandasamy et al. 2015}}: + \begin{equation*} + \alert{f(\conf)=f^{(1)}(\conf^{(1)})+f^{(2)}(\conf^{(2)})+...+f^{(M)}(\conf^{(M)})} + \end{equation*} + \item Model each $f^{(i)}$ by an individual GP +\medskip +\pause + \item If the decomposition does not overlap:\\ + can maximize acquisition function separately for each of the $f^{(i)}$ +\medskip +\pause + \item Best results for known decomposition, but also possible to learn decomposition from the data + \end{itemize} +\end{itemize} + +\end{frame} + +\iffalse +%---------------------------------------------------------------------- +\begin{frame}[c]{High Dimensional Bayesian Optimization} +\framesubtitle{Additive GP Models in a nutshell} + \begin{equation*} + f(\conf)=f^{(1)}(\conf^{(1)})+f^{(2)}(\conf^{(2)})+...+f^{(M)}(\conf^{(M)}) + \end{equation*} +\begin{itemize} +\begin{columns}[T] +\begin{column}{0.45\linewidth} + +\hspace{2em} + \item Key assumption: $f$ decomposes into lower-dimensional additive components $\conf^{(j)} \in \pcs^{(j)}, j \in \{1,..,M\}$ + \item The decompositions are disjoint $\conf^{(i)} \cap \conf^{(j)} = \varnothing$ + \pause + \item Each decomposition $f^{(j)}(\conf^{(j)})$ is modelled by an individual GP + \pause + \end{column} + \begin{column}{0.45\linewidth} + \begin{figure} + \includegraphics[width=0.7\textwidth]{images/highdim_images/additive-models.png} + \caption{Decomposition in two additive components (M=2) \\ Source: + \end{figure} + \end{column} +\end{columns} +\end{itemize} +\end{frame} + +%---------------------------------------------------------------------- + +\begin{frame}[c]{High Dimensional Bayesian Optimization} +\framesubtitle{Additive GP-UCB} +\begin{itemize} + + \item Idea: Represent acquisition function as sum of functions on decompositions: + \begin{equation*} + \acq_{t}(\conf) = \sum_{j}\acq_{t}^{(j)}(\conf^{(j)}) + \end{equation*} +\pause +\medskip + \item $\acq_{t}$ is maximized by maximizing each $\acq_{t}^{(j)}$ separately: + \begin{equation*} + \hat{\varphi}_{t}^{(j)}(\conf^{(j)}) = \mean_{t-1}^{(j)}(\conf^{(j)}) + \beta_{t}^{1/2}\stddev_{t-1}^{(j)}(\conf^{(j)}) + \end{equation*} + \item Authors have used UCB for this work, but other acquisition functions are possible, too. +\end{itemize} +\source{\href{http://proceedings.mlr.press/v37/kandasamy15.pdf}{Kandasamy et al. 
2015}} +\end{frame} +%---------------------------------------------------------------------- + +\fi +% \begin{frame}[c]{High Dimensional Bayesian Optimization} +% \framesubtitle{Additive GP-UCB- Pseudocode} +% \begin{algorithm}[H] +% %\DontPrintSemicolon +% \LinesNumbered +% \SetAlgoLined +% \setcounter{AlgoLine}{0} +% \SetKwInOut{Input}{Input} + +% %\Input{Kernels $\kernel^{(1)},...,\kernel^{(M)}$, Decomposition $(\pcs^{(j)})_{j=1}^{M}$}\\ +% \Input{ Kernels $\kernel^{(1)},...,\kernel^{(M)}$, Decomposition $(\pcs^{(j)})_{j=1}^{M}$ +% $\dataset_{0}\leftarrow\varnothing$} +% \For{$j=1,...,M$, $(\mean_0^{(j)},\kernel_0^{(j)})\leftarrow(0,\kernel^{(j)})$.}{ +% \For{$j=1,...,M$,}{ +% $\confI{t}_{(j)}\leftarrow\argmax_{z\in\pcs^{(j)}}\mean_{t-1}^{(j)}(z) +\sqrt{\beta_{t}}\stddev_{t-1}^{(j)}(z)$;\ + +% $\confI{t}\leftarrow\bigcup_{j=1}^{M} \confI{t}_{(j)}$;\ + +% $\obs\leftarrow$ Query $\cost$ at $\confI{t}$;\ + +% $\dataset_{t}=\dataset_{t-1}\cup\{(\confI{t},\obs)\}$;\ + +% Perform Bayesian Optimization posterior updates conditioned on $\dataset_{t}$ to obtain $\mean_{t}^{(j)},\stddev_{t}^{(j)}$ for $j=1,...,M$;\ +% } +% } +% \caption{Add-GP-UCB} +% \end{algorithm} +% \end{frame} + + +%---------------------------------------------------------------------- +\begin{frame}[c]{High Dimensional Bayesian Optimization via Additive Models} +\begin{columns}[T] % align columns +\begin{column}{.48\textwidth} + + + \begin{block}{Advantages} + \begin{itemize} + \item Exploits low effective dimensionality + \item Scales GPs to high-dimensional parameter spaces + \item Regret is linearly dependent on the dimension D when $\cost$ is additive +% \item Add-GP-UCB applies to an additive kernel + \end{itemize} + \end{block} +\pause +\end{column}% + +\hfill% + +\begin{column}{.48\textwidth} + + \begin{block}{Disadvantages} + \begin{itemize} + \item Sensitive to the number of additive components + \item Restricted to an axis-aligned representation + \item Relies on assumption of additivity + %structural assumptions about the objective function + \end{itemize} +\end{block} + +\end{column} +\end{columns} +\end{frame} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} + +\begin{itemize} + +\item \alert{Repetition.} What is the main assumption behind REMBO? +\medskip + +\item \alert{Repetition.} What is the main assumption behind additive modelling? +\medskip + +\item \alert{Discussion.} Are these assumptions likely satisfied for tuning deep neural networks? +\medskip + +\item \alert{Discussion.} How do random forests help deal with high dimensions and low effective dimensionality? Can they also model additive structure? 
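As an illustrative aside (not from the slide sources): a minimal sketch of why a non-overlapping additive decomposition allows the acquisition function to be maximized separately per component, as discussed in the additive-models slides above. The toy component functions below stand in for the per-component GP acquisition functions; all names and constants are illustrative assumptions.

```python
import numpy as np

# Toy additive objective with a disjoint decomposition: f(x) = f1(x[0]) + f2(x[1])
f1 = lambda a: (a - 0.2) ** 2
f2 = lambda b: np.abs(b + 0.4)

grid = np.linspace(-1.0, 1.0, 201)

# With the additive assumption, each component is optimized on its own
# 1-D grid (2 * 201 evaluations) ...
best_a = grid[np.argmin(f1(grid))]
best_b = grid[np.argmin(f2(grid))]

# ... instead of searching the full 2-D grid (201 ** 2 evaluations)
joint = f1(grid)[:, None] + f2(grid)[None, :]
ia, ib = np.unravel_index(joint.argmin(), joint.shape)

print("component-wise optimum:", (best_a, best_b))
print("joint grid optimum:    ", (grid[ia], grid[ib]))  # same point, far fewer evaluations
```

The same separability argument is what lets Add-GP-UCB maximize each per-component acquisition function $\acq^{(j)}$ independently when the decomposition does not overlap.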
+ + +\end{itemize} +\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t06_extensions.pdf b/w06_hpo_bo/t06_extensions.pdf new file mode 100644 index 0000000..379e072 Binary files /dev/null and b/w06_hpo_bo/t06_extensions.pdf differ diff --git a/w06_hpo_bo/t06_extensions.tex b/w06_hpo_bo/t06_extensions.tex new file mode 100644 index 0000000..b94943e --- /dev/null +++ b/w06_hpo_bo/t06_extensions.tex @@ -0,0 +1,412 @@ +\videotitle{Extensions to Bayesian Optimization} + +%----------------------------------------------------------------------- +\begin{frame}{Some More Extensions We Will Discuss} +\begin{block}{Standard Bayesian optimization problems} +\begin{itemize} + \item Continuous, smooth functions + \item Sequential optimization + \item Noise-free evaluations +% \item No constraints +\end{itemize} +\end{block} +% \pause +\begin{block}{Extensions} +\begin{itemize} + \item Structured search spaces: categorical \& conditional hyperparameters +% \item Disconnected search spaces + \item Parallel evaluations + \item Noisy evaluations + \item Optimization with constraints +% \item Multi-objective Bayesian optimization +\end{itemize} +\end{block} +\end{frame} + +%\begin{frame}[c]{Categorical and Conditional Parameters} +%\framesubtitle{Introduction} +%\begin{itemize} +% \item<+->{Our parameter configuration space $\pcs$ can possibly contain: +% \begin{itemize} +% \item<+->{Neural Network Architectures.} +% \item<+->{Model-specific parameters.} +% \item<+->{General optimization parameters.} +% \end{itemize} +% } +% \item<+->{Consider searching through such a space of parameters. Is every individual dimension of this search space- +% \begin{itemize} +% \item<+->{Continuous?} +% \item<+->{Relevant?} +% \end{itemize} +% } +%\end{itemize} +%\end{frame} +%----------------------------------------------------------------------- +%\begin{frame}[c]{Categorical and Conditional Parameters} +%\framesubtitle{Categorical Parameters} +%\begin{itemize} +% \item<+-> Parameters that draw values from a discrete domain instead of a real-valued domain. +% \item<+-> Mathematically, a parameter $\hyperparam$ is a categorical parameter if $\hyperparam\in P$, where $P=\{p_1, p_2, \dots\}$ is a set of finite, discrete values. +% \item<+-> Examples: +% \begin{itemize} +% \item<+-> For training a neural network, we may choose one flavor of SGD out of $\{Vanilla, \,RMSProp, \,Adam\}$. +% \item<+-> For a layer in a Multi-Layer Perceptron, we may choose one activation function out of $\{tanh, \,sigmoid, \,relu, \,unit\}$. +% \end{itemize} +% \item<+-> Categorical parameters present a challenge: inferring gradients is not possible for unordered categories! +% \item<+-> Another challenge: Each individual category, or possible value of a categorical parameter, contributes to the curse of dimensionality in naive search approaches. +%\end{itemize} +%\end{frame} +%%----------------------------------------------------------------------- +%\begin{frame}[c]{Categorical and Conditional Parameters} +%\framesubtitle{Hamming Distance Kernel} +%\begin{center} +%Placeholder - Describe Hamming Distance Kernel from Frank's PhD thesis, include visualization +%\end{center} +%\end{frame} +%----------------------------------------------------------------------- +%\begin{frame}[c]{Categorical and Conditional Parameters} +%\framesubtitle{Conditional Parameters} +%\begin{itemize} +% \item<+-> Some parameters in the search space are only relevant in the context of specific values of other parameters. 
+% \item<+-> For example, if we are training a Neural Network using SGD, the momentum parameter is only relevant when using a flavour of SGD that supports it, such as Adam, +% \item<+-> Such parameters can be used to define conditional dependencies between parameters +% \item<+-> These dependencies define active/inactive sub-spaces within the search space +% \item<+-> Conditional parameters are most recognizable in the context of categorical parameters, but they need not be categorical +% \item<+-> Similar to categorical parameters, inferring gradients is not possible due to the presence of active/inactive sub-spaces +%\end{itemize} +%\end{frame} +%----------------------------------------------------------------------- +% \begin{frame}[c]{Categorical and Conditional Hyperparameters} +% \framesubtitle{Structured Search Spaces} +% \begin{itemize} +% \item<+-> In HPO, we have prior knowledge about when some hyperparameters in the search space are completely irrelevant +% \item<+-> Naively searching over the entire search space while disregarding any conditional dependencies is inefficient +% \item<+-> We can impose a structure over the search space with the help of conditional dependencies between the various parameters to speed-up and optimize the HPO task +% \end{itemize} +% \end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Structured Search Spaces: Categorical \& Conditional Hyperparameters} +\begin{center} + \includegraphics[width=.9\linewidth, height=0.9\textheight, keepaspectratio=true]{w06_hpo_bo/images/categ_cond_params/Conditional Parameters AutoML Book.png} + \newline + Example of a structured search space (Source: Figure 5.1 of the \lit{\href{https://www.automl.org/wp-content/uploads/2019/05/AutoML_Book.pdf}{AutoML book}}) +\end{center} +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Structured Search Spaces: Categorical Hyperparameters} + +\begin{columns}[T] +\column{0.65\textwidth} + +\medskip +Properties of categorical hyperparameters: +\myit{ + \item \alert{Finite, discrete} set of values + \item \alert{No natural order} between values + \item Potentially different distances between values +} + + +\column{0.35\textwidth} +\vspace{0.5cm} +\includegraphics[width=1\textwidth]{w06_hpo_bo/images/categ_cond_params/categorical.png} +% +\end{columns} + +\pause +\vspace*{-0.4cm} +This has to be taken into account by the surrogate model: +% +\begin{itemize} + \item Random Forests \alert{natively} handle categorical inputs \lit{\href{https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf}{Hutter et al, 2011}} + \item \emph{One-hot} encoding provides a simple general solution + \item Gaussian Processes can use a (weighted) \alert{Hamming Distance Kernel} \lit{\href{https://www.cs.ubc.ca/~hutter/papers/Hutter09PhD.pdf}{Hutter 2009}}: +\vspace*{-0.2cm} +\begin{equation*} + \kernel_{\theta}(\conf_i, \conf_j) = \exp{\sum_{l=1}^d (-\theta \cdot \delta(\hyperparam_{i,l} \neq \hyperparam_{j,l}))} +\end{equation*} + +\vspace*{-0.2cm} +\item Neural networks can learn \alert{entity embeddings} for categorical inputs \lit{\href{https://arxiv.org/pdf/1604.06737.pdf}{Guc et al. 
2016}} +\end{itemize} +% +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Structured Search Spaces: Conditional Hyperparameters} + + +\begin{columns}[T] +\column{0.65\textwidth} +\vspace*{0.2cm} +Conditional hyperparameters: +\myit{ + \item Are \alert{only relevant if} certain other hyperparameters take on certain values + \item \alert{Should be ignored} by the model \alert{if not active} +} + +\column{0.35\textwidth} +\vspace{0.5cm} +\includegraphics[width=1\textwidth]{w06_hpo_bo/images/categ_cond_params/conditional.png} +% +\end{columns} + +\pause +\vspace*{-0.2cm} +Modelling conditional hyperparameters: +\myit{ + \item Setting the values for inactive hyperparameter to a specific value (e.g. $0$) + \item Random Forests \lit{\href{https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf}{Hutter et al. 2011}} and Tree Parzen Estimators \lit{\href{http://papers.nips.cc/paper/4443-algorithms-for-hyper}{Bergstra et al. 2011}} can \alert{natively} handle conditional inputs + \item There exist \alert{several kernels for Gaussian Processes} to handle conditional inputs \lit{\href{https://arxiv.org/abs/1310.5738}{Hutter et al. 2013}} \lit{\href{https://www.etsmtl.ca/Unites-de-recherche/LIVIA/Recherche-et-innovation/Publications/Publications-2017/Levesque_ijcnn_2017.pdf}{Lévesque et al. 2017}} \lit{\href{http://proceedings.mlr.press/v70/jenatton17a.html}{Jenatton et al. 2017}} +} +% +\pause +\vspace{0.4cm} +Overall, structured search spaces are \alert{still an active research topic} and far from solved +\end{frame} + + +\begin{frame}[c]{Parallel Bayesian Optimization: Multi-point Acquisition Functions} + +\myit{ + \item Often, we have many parallel compute units + \item How should these be exploited in (the typically inherently sequential) Bayesian optimization? +\medskip +\pause + \item To select a batch of $q$ points in parallel, we need to compute the multi-point acquisition function. + E.g., for expected improvement: + \begin{equation*} + q\text{-EI}(\conf_{1, \dots, q}) = \E \left[ \cost(\incumbent) - \min_{i=1, \dots, q} \surro(\conf_i) \right] + \end{equation*} +\pause +\medskip + \item For EI and KG, this requires \emph{expensive-to-compute} q-dimensional Gaussian cumulative distributions \lit{\href{https://hal.archives-ouvertes.fr/hal-00260579/document}{Ginsbourger et al. 2007}}, \lit{\href{https://arxiv.org/pdf/1606.04414v4.pdf}{Wu et al. 2018}}, \lit{\href{https://arxiv.org/pdf/1602.05149.pdf}{Wang et al. 2019}} + \pause + \item Nevertheless, multi-point acquisition functions can be optimized efficiently with gradient descent via the reparameterization trick \lit{\href{https://papers.nips.cc/paper/8194-maximizing-acquisition-functions-for-bayesian-optimization}{Wilson et al. 2018}} +} +\end{frame} +%---------------------------------------------------------------------- +\begin{frame}[c]{Asynchronous Parallel Bayesian Optimization with Pending Evaluations} + +\myit{ + \item In practice, typically, not all function evaluations take the same amount of time + \myit{ + \item Thus, we need to select \alert{some new points} while we're still waiting for \alert{pending evaluations} at other points + } +\pause +\bigskip +% + \item Simple solution: \alert{hallucinate observations for pending evaluations}, and use otherwise standard methods: + \pause + \myit{ + \item \alert{Constant Liar}: Choose a fixed value (constant) \lit{\href{http://www.cs.ubc.ca/labs/beta/EARG/stack/2010_CI_Ginsbourger-ParallelKriging.pdf}{Ginsbourger et al. 
2010}} +\smallskip +\item \alert{Kriging Believer}: Use the current mean prediction (belief) \lit{\href{http://www.cs.ubc.ca/labs/beta/EARG/stack/2010_CI_Ginsbourger-ParallelKriging.pdf}{Ginsbourger et al. 2010}} +\smallskip +\item \alert{Monte Carlo Fantasies} + \pause + \myit{ + \item Sample pending evaluations from the model + \item Update copy of the model with these samples + \item Compute acquisition function under each updated copy + \item Define acquisition function as an average over these sampled acquisition functions +% (fantasies, more details on the next slide). + } + } +} + +\end{frame} +%---------------------------------------------------------------------- +\iffalse +\myframetop{Asynchronous Parallel Bayesian Optimization with Pending Evaluations}{ + % + Assume we have observed data $\left\{ \left\langle \bonextsample, \bonextobs \right\rangle \right\}^{N}_{\bocount = 1}$ and $J$ evaluations are pending $\left \{\conf_{j} \right \}^{J}_{j = 1}$. We can compute the expected mean function using the following integral: \pause + \begin{equation*} + \begin{aligned} + \Bar{\acq} \left( \conf; \left\{ \left\langle \bonextsample, \bonextobs \right\rangle \right\}^{N}_{\bocount = 1}, \left \{\conf_{j} \right \}^{J}_{j = 1} \right) = \pause + \int_{\mathbb{R}^J} \acq \left( \conf; \left\{ \left\langle \bonextsample, \bonextobs \right\rangle \right\}^{N}_{\bocount = 1}, \left \{ \left\langle \conf_j, \obs_j \right\rangle \right \}^{J}_{j=1} \right) \\ + p( \{ \obs_j \}^{J}_{j = 1} \rvert \{ \conf_j \}^{J}_{j = 1}, \left \left\{ \left\langle \bonextsample, \bonextobs \right\rangle \right\}^{N}_{\bocount = 1} )d\obs_1 \dots d\obs_J + \end{aligned} + \end{equation*} + % + \vspace{-0.8cm} + \begin{columns} + \column{0.6\textwidth} + \only<4->{ + \begin{enumerate} + \only<4->{\item Evaluated observations: $\left \{\conf_1, \conf_3, \conf_4 \right \}$, pending: $\left \{\conf_2, \conf_5 \right \}$.} + \only<5->{\item Fit a model for each possible realization of $\left \{\cost(\conf_2), \cost(\conf_5) \right \}$.} + \only<6->{\item Calculate acquisition function for each model.} + \only<7->{\item Integrate all acquisition functions over $\conf$.} + \end{enumerate} + } + + \column{0.4\textwidth} + \only<4-5>{ + \begin{figure} + \centering + \includegraphics[width=0.8\textwidth]{w06_hpo_bo/images/parallel/parallel_a.jpg} + \end{figure} + } + \only<6>{ + \begin{figure} + \centering + \includegraphics[width=0.8\textwidth]{w06_hpo_bo/images/parallel/parallel_b.jpg} + \end{figure} + } + \only<7->{ + \begin{figure} + \centering + \includegraphics[width=0.8\textwidth]{w06_hpo_bo/images/parallel/parallel_c.jpg} + \end{figure} + } + \end{columns} + + \source{\href{https://papers.nips.cc/paper/4522-practical-bayesian-optimization-of-machine-learning-algorithms.pdf}{Snoek et al. 2012}} + +\vspace*{-0.4cm} +% \notefh{As I mentioned in class, I think the EI plot is wrong; it needs to be zero at the points where fantasies are being evaluated. Can you code this up and make a proper plot this week? Otherwise, I would drop this slide.} + +} +\fi +% +%\begin{frame} +%\begin{itemize} +% \item <+-> Utilize tractable properties of GP to get Monte Carlo estimates of %acquisition function under different results from pending function evaluations. 
\pause +% \item <+-> Consider the case where $N$ evaluations have completed, with data $\left \{\bonextsample, \bonextobs \right \}^{N}_{\bocount = 1}$ and $J$ evaluations are pending $\left \{\conf_{j} \right \}^{J}_{j = 1}$: \pause +% \begin{equation*} +% \begin{aligned} +% \hat{\acq} ( \conf; \left \{ \bonextsample, \bonextobs \right \}, \left \{ \conf_j \right \} ) = \pause +% \int_{\mathbb{R}^J} \pause \acq ( \conf; \left \{ \bonextsample, %\bonextobs \right \}, \left \{ \conf_j, \obs_j \right \} ) \\ \pause +% p(\left \{ \obs_j \right \}^{J}_{j = 1} \rvert \left \{ \conf_j \right \}^{J}_{j = 1}, \left \{ \bonextsample, \bonextobs \right \}^{N}_{\bocount=1} )d\obs_1 \dots d\obs_J +% \end{aligned} +% \end{equation*} +%\end{itemize} + +%\source{\href{https://csc2541-f17.github.io/}{Scalable and Flexible Models of Uncertainty, University of Toronto}} + +%\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Noisy Evaluations} + + \begin{columns}[T] + + \column{0.6\textwidth} + + \myit{ + \item The probabilistic model natively supports Gaussian noise + \myit{ + \item A good hyperprior for the noise variance might be needed to robustly determine the right size of the noise + \item Noise that is not Gaussian (and not Student t \lit{\href{https://www.asc.ohio-state.edu/santner.1/TJS-BJW-WIN/master-driver.pdf}{Santner et al, 2014}}) would require approximations + } + } + + \column{0.4\textwidth} + \includegraphics[width=\textwidth]{images/extensions/BO_Loop_Noisy3.png} + + \end{columns} + % Given noisy evaluations, GP regression proceeds similarly to the noiseless case by adding the variance to the diagonal of the covariance matrix. + % \vspace{-0.2cm} + % \item KG and ES directly handle noisy function evaluations~\lit{\href{https://arxiv.org/abs/1807.02811}{Frazier 2018}}. + % \vspace{-0.2cm} + +\pause +\bigskip + \myit{ + \item The acquisition function might need adaptation + \myit{ + \item LBC, TS, ES, and KG, are not affected + \item PI and EI are based on an the cost $\cost_{inc}$ of the incumbent; it is unclear how to compute this + \begin{itemize} + \item Uncertainty about which point is the current incumbent + \item Uncertainty about the costs $\cost(\conf_i)$. + \item \alert{Noisy Expected Improvement}~\lit{\href{https://arxiv.org/abs/1706.07094}{Letham et al. 2019}} extends regular EI by integrating over the predictive posterior of the model using Monte Carlo + \end{itemize} + } + % Computing EI with observation noise is challenging: + % \vspace{-0.2cm} +} + + +%\begin{itemize} +% \item Noisy Expected Improvement~\lit{\href{https://arxiv.org/abs/1706.07094}{Letham et al. 2019}} extends the regular Expected Improvement by integrating over the predictive posterior of the model: +% % There's an additional paper by Gramacy and Lee from 2011 which does a more complicated treatment of noise in EI +% \begin{equation*} +% \acq_{NEI}(\conf|\dataset)=\int_{\surro}\acq_{EI}(\conf|\surro)p(\surro|\dataset)\text{d}\surro +% \end{equation*} +% \vspace{-0.2cm} +% \begin{itemize} +% \item Compute with Monte Carlo Integration. +% \item Each sample from the model posterior has its own incumbent $\incumbent[\bocount-1]$. +% \end{itemize} +% \end{itemize} +\end{frame} + +%---------------------------------------------------------------------- + +\begin{frame}[c]{Bayesian Optimization with Constraints} + +\begin{columns}[T] + +\column{0.6\textwidth} + +Three types of constraints +% \notefh{Four? There are 3 listed here. 
Also, from the description here I don't see a difference between unknown and hidden constraints, just that in the unknown case you're trying to model them!? @Matthias: this might be most efficient to discuss on the phone.} +\begin{small} +\begin{enumerate} + \item Known constraints: can be accounted for when optimizing $\acq$ + %\item Policy constraints: function value is observed, but deemed forbidden~\lit{\href{https://www.soe.ucsc.edu/sites/default/files/technical-reports/UCSC-SOE-10-10.pdf}{Lee and Gramacy 2010}} + \item Hidden constraints: no function value is observed due to a failed function evaluation~\lit{\href{https://www.soe.ucsc.edu/sites/default/files/technical-reports/UCSC-SOE-10-10.pdf}{Lee and Gramacy 2010}} + \item Unknown constraints: there's an additional, but unknown constraint function, for example the memory used, which can be observed and modeled +\end{enumerate} +\end{small} + +\column{0.4\textwidth} +\includegraphics[width=0.9\textwidth]{images/extensions/notebooks_constrained_bo_4_0.png}\\ + \footnotesize{Hidden constraints. Image source: \lit{\href{https://gpflowopt.readthedocs.io/en/latest/notebooks/constrained_bo.html}{GPFlowOpt Tutorial, Apache 2 License}}} + +\end{columns} + +Most general solution: \emph{Expected Constrained Improvement}~\lit{\href{https://www.soe.ucsc.edu/sites/default/files/technical-reports/UCSC-SOE-10-10.pdf}{Lee and Gramacy 2010}}: +\vspace{-0.1cm} +\begin{equation} + ECI(\conf) = EI(\conf)h(\conf), +\end{equation} +\vspace{-0.1cm} +where $h(\conf)$ is the probability that $\conf$ is a valid configuration. + +\vspace{0.1cm} +Further literature in \lit{\href{https://arxiv.org/abs/1807.02811}{Frazier 2018}} and \lit{\href{https://link.springer.com/chapter/10.1007/978-3-030-05318-5_1}{Feurer and Hutter 2019}}. + +\end{frame} +%---------------------------------------------------------------------- +\begin{frame}[c]{Further extensions} +\framesubtitle{Even more extensions} +Bayesian optimization has been extended to numerous scenarios: +\begin{itemize} + \item Multi-task, Multi-fidelity and Meta-learning $\rightarrow$ separate lecture + \item Multi-objective Bayesian optimization $\rightarrow$ separate lecture + \item Bayesian optimization with safety guarantees~\lit{\href{http://proceedings.mlr.press/v37/sui15.pdf}{Sui et al. 2015}} + \item Directly optimizing for ensemble performance~\lit{\href{http://auai.org/uai2016/proceedings/papers/73.pdf}{Lévesque et al. 2016}} + \item Combination with local search methods~\lit{\href{https://www.researchgate.net/publication/241216681_Bayesian_Guided_Pattern_Search_for_Robust_Local_Optimization}{Taddy et al. 2009}}~\lit{\href{https://papers.nips.cc/paper/8788-scalable-global-optimization-via-local-bayesian-optimization.pdf}{Eriksson et al. 2019}} + \item Optimization of arbitrary spaces that can be described by a kernel (e.g., neural network architectures~\lit{\href{https://papers.nips.cc/paper/7472-neural-architecture-search-with-bayesian-optimisation-and-optimal-transport.pdf}{Kandasamy et al. 2018}} or + %a latent embedding, such as + molecules~\lit{\href{https://arxiv.org/abs/1709.05501}{Griffiths et al. 
2017}}) + \item Many more (too many to mention) +\end{itemize} + +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} + +\begin{itemize} +% Categorical and Conditional +\item \alert{Discussion.} What would happen if you treat a categorical hyperparameter as continuous (e.g., $\{A, B, C\}$ as $\{0, 0.5, 1\}$), in Bayesian optimization using a Gaussian Process? +\medskip +% Parallel +\item \alert{Repetition.} Which methods can you use to impute values for outstanding evaluations? What are advantages and disadvantages of each method? +\medskip +% Noise & Constrained +\item \alert{Discussion.} What are worst case scenarios that could happen if you ignore the noise during Bayesian optimization? +% Further +\end{itemize} +\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t06_tpe.tex b/w06_hpo_bo/t06_tpe.tex deleted file mode 100644 index db6dd87..0000000 --- a/w06_hpo_bo/t06_tpe.tex +++ /dev/null @@ -1,159 +0,0 @@ -\section{Tree-Parzen Estimator} -%----------------------------------------------------------------------- -\begin{frame}[c]{Tree-Parzen Estimator} -\framesubtitle{Introduction} - -\begin{itemize} - \item Instead of modelling $ - P(\func \vert \dataset_{1:\bocount}) \propto P(\dataset_{1:\bocount} \vert \func) \times P(\func)$, TPE models $P(\dataset_{1:\bocount} \vert \func)$ - %\item \emph{Recall.} Bayesian optimization approach: - % \begin{equation*} - % P(\obs \vert \conf) \propto P(\conf \vert \obs) \times P(\obs) - % \end{equation*} - \item TPE then defines two such distributions, $l$ and $g$: - \begin{equation*} - P(\conf \vert \obs) = - \begin{cases} - l(\conf) \text{ if } \obs < \obs^*\\ - g(\conf) \text{ otherwise} - \end{cases} - \end{equation*} - where $\obs^*$ is an empirical threshold for a well-performing configuration (e.g., a $\gamma$ percentile of all observed $\obs$ in $D$) - \item Distributions are approximated by kernel density estimators (Parzen estimators) - \item Optimizing $l(\conf)/g(\conf)$ is equivalent to optimizing \emph{expected improvement} as the acquisition function in Bayesian optimization - \item The \emph{tree} in the name is there because TPE can handle tree-structured search spaces - -\end{itemize} - -\source{\href{https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf}{Bergstra et al. 
2011}} - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Tree-Parzen Estimator} -\framesubtitle{Pseudocode} - - -\begin{center} -\begin{minipage}{0.75\textwidth} -\begin{algorithm}[H] - %\DontPrintSemicolon - \SetAlgoLined - \setcounter{AlgoLine}{0} - \SetKwInOut{Require}{Require} - \SetKwInOut{Result}{Result} - \Require{Search space $\pcs$, - cost function $\cost$, - \textcolor{blue}{percentile $\gamma$}, - maximal number of function evaluations $\bobudget$} - \Result{Best observed configuration $\conf$ according to $\iter[\bobudget]{\dataset}$} - - $\iter[0]{\dataset} \leftarrow \varnothing$\; - - \For{$\bocount=1$ \KwTo $\bobudget$}{ - \textcolor{blue}{$\dataset_\text{good}, \dataset_\text{bad}$ $\leftarrow$ split $\iter[\bocount-1]{\dataset}$};\ - - \textcolor{blue}{$l(\conf)$, $g(\conf)$ $\leftarrow$ fit KDE on $\dataset_\text{good}$, $\dataset_\text{bad}$ respectively};\ - - \textcolor{blue}{$\pcs_\text{cand}$ $\leftarrow$ draw samples from $l$};\ - - \textcolor{blue}{$\bonextsample \leftarrow \bonextsample \in \argmax_{\conf \in \pcs_\text{cand}} l(\conf) / g(\conf)$};\ - - Query $\bonextobs$\; - - $\iter[\bocount]{\dataset} \leftarrow \iter[\bocount-1]{\dataset} \cup \{\langle \bonextsample, \bonextobs \rangle \}$\; - } - \caption{TPE loop} -\end{algorithm} -\end{minipage} -\end{center} - -\source{Bergstra et al. 2011} - -\end{frame} -%----------------------------------------------------------------------- -%----------------------------------------------------------------------- -\begin{frame}[c]{Tree-Parzen Estimator} -\framesubtitle{Example} -\onslide<1-> -\begin{figure} - \centering - \only<1>{\includegraphics[width=0.6\textwidth]{w07_hpo_grey_box/images/tpe/tpeiter_1_observations.png}} - \only<2>{\includegraphics[width=0.6\textwidth]{w07_hpo_grey_box/images/tpe/tpeiter_1_pdfs.png}} - \only<3>{\includegraphics[width=0.6\textwidth]{w07_hpo_grey_box/images/tpe/tpeiter_2_observations.png}} - \only<4>{\includegraphics[width=0.6\textwidth]{w07_hpo_grey_box/images/tpe/tpeiter_2_pdfs.png}} - \only<5>{\includegraphics[width=0.6\textwidth]{w07_hpo_grey_box/images/tpe/tpeiter_3_observations.png}} - \only<6>{\includegraphics[width=0.6\textwidth]{w07_hpo_grey_box/images/tpe/tpeiter_3_pdfs.png}} -\end{figure} -\centering - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Tree-Parzen Estimator} -\framesubtitle{Further Details} - -Remarks: - -\begin{itemize} - \item TPE models $p(\conf | \obs)$ - \begin{itemize} - \item we can multiply it with a prior to add expert knowledge - \end{itemize} - \smallskip - - \pause - - \item Performance of TPE depends on: - \begin{itemize} - \item setting of $\gamma$ to trade-off exploration and exploitation - \item bandwidth of the KDEs - \end{itemize} - - \pause - - \smallskip - - \smallskip - \item A successful tool implementing TPE is \lit{\href{https://github.com/hyperopt/hyperopt}{hyperopt}} -\end{itemize} - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Tree-Parzen Estimator} -\framesubtitle{Summary} - -\begin{columns}[T] % align columns -\begin{column}{.48\textwidth} - \begin{block}{Advantages} - \begin{itemize} - \item Efficient $O(N*d)$ - \item Parallelizable - \item Robust - \item Deal with complex search spaces with priors - \end{itemize} - \end{block} -\end{column}% - -\hfill% - -\pause - -\begin{column}{.48\textwidth} - \begin{block}{Disadvantages} - \begin{itemize} - \item Less 
sample-efficient than GPs - \end{itemize} - \end{block} -\end{column} -\end{columns} - -\end{frame} -%----------------------------------------------------------------------- -\begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} - -\begin{itemize} - \item \emph{Discussion.} Is TPE really Bayesian optimization? - \item \emph{Discussion.} How does $\gamma$ impact the optimization procedure? - \item \emph{Repetition.} Derive that optimizing $l(\conf) / g(\conf)$ is equivalent to optimizing Expected Improvement. -\end{itemize} - -\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t07_success_stories.tex b/w06_hpo_bo/t07_success_stories.tex deleted file mode 100644 index b4f1220..0000000 --- a/w06_hpo_bo/t07_success_stories.tex +++ /dev/null @@ -1,86 +0,0 @@ -%---------------------------------------------------------------------- -\section{Success Stories} -\begin{frame}[c]{Success Stories} -\framesubtitle{Spearmint} - -\small -\begin{itemize} - \item First successful open source Bayesian optimization implementation. - \item Was used to tune a neural network to state-of-the-art performance on CIFAR-10 in 2012. - \item Implements standard Bayesian optimization with MCMC integration of the acquisition function, asynchronous parallelism, input warping and constraints. - \item Startup based on Spearmint got acquired by Twitter in 2015. - \item Still heavily used and cited and available at \url{https://github.com/HIPS/spearmint}: - \begin{center} - \only{\includegraphics[width=0.7\linewidth, keepaspectratio=true]{w07_hpo_grey_box/images/success_stories/jsnoek_spearmint_git_stats.png}} - - \only{\includegraphics[width=0.7\linewidth, keepaspectratio=true]{w07_hpo_grey_box/images/success_stories/hips_spearmint_git_stats.png}} - - \only{\includegraphics[width=.5\linewidth, keepaspectratio=true]{w07_hpo_grey_box/images/success_stories/spearmint_alt_stats.png}} - \newline Google Scholar screenshot from 3rd March, 2020 - \end{center} -\end{itemize} -\end{frame} - -%----------------------------------------------------------------------- -\begin{frame}[c]{Success Stories} -\framesubtitle{Hyperopt} -\begin{itemize} - \item Hyperopt is another successful open source Bayesian optimization package. - \item Implements the TPE algorithm and supports asynchronous parallel evaluations. - \item Maintained since 2013. - \item Available at \url{https://github.com/hyperopt/hyperopt} -\end{itemize} -\vspace{1cm} -\includegraphics[width=\linewidth, height=\textheight, keepaspectratio=true]{w07_hpo_grey_box/images/success_stories/hyperopt_git_stats.png} - -\vspace{1cm} -\hspace{2cm} -\lit{\href{https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf}{Bergstra et al. 2011}}, \lit{\href{http://proceedings.mlr.press/v28/bergstra13.pdf}{Bergstra et al., 2013}}, \lit{\href{http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.704.3494&rep=rep1&type=pdf}{Bergstra et al., 2013}}, \lit{\href{https://iopscience.iop.org/article/10.1088/1749-4699/8/1/014008/ampdf}{Bergstra et al., 2015}} - -\end{frame} - -%----------------------------------------------------------------------- -\begin{frame}[c]{Success Stories} -\framesubtitle{AlphaGo} -\begin{itemize} - \item During the development of AlphaGo, its many hyperparameters -were tuned with Bayesian optimization multiple times. - \item This automatic tuning process resulted in substantial improvements in playing strength. 
For example, prior to the match with Lee Sedol, we tuned the latest AlphaGo agent and this improved its win-rate from 50\% to 66.5\% in self-play games. This tuned version was deployed in the final match. - \item Of course, since we tuned AlphaGo many times during its development cycle, the compounded contribution was even higher than this percentage. -\end{itemize} -\vspace{1cm} -\hspace{12cm}\lit{\href{https://arxiv.org/abs/1812.06855}{Chen et al.}} -\end{frame} - -%----------------------------------------------------------------------- -\begin{frame}[c]{Success Stories} -\framesubtitle{Company usage} -\begin{itemize} - \item SIGOPT: startup offering Bayesian optimization as a service. - \item Facebook provides an open source Bayesian optimization package \lit{\href{https://botorch.org/}{BoTorch}}. - \item Amazon provides an open source Bayesian optimization package \lit{\href{https://amzn.github.io/emukit/}{EmuKit}}. - \item Uber tunes algorithms for \emph{Uber Pool}, \emph{UberX} and \emph{Uber Eats} \lit{\href{http://mcqmc2016.stanford.edu/Frazier-Peter.pdf}{source}} - \item Many more, but less openly -\end{itemize} -\end{frame} - -%----------------------------------------------------------------------- -\begin{frame}[c]{Success Stories} -\framesubtitle{Auto-WEKA} -\begin{itemize} - \item Introduce Bayesian optimization for \emph{\textbf{C}ombined \textbf{A}lgorithm \textbf{S}election and \textbf{H}yperparameter optimization} problem (CASH problem). - \begin{itemize} - \item Each configuration $\conf$ comprises a choice of algorithm $A^{(j)} \in \mathcal{A}$ - \item Hyperparameters of $A^{(j)}$ are conditional on $A^{(j)}$ being selected - \item $\argmin_{\conf} \frac{1}{k} \sum^k_{i=1} \loss(A^{(j)}_\lambda,\datasettrain,\datasetval)$ - \end{itemize} - \item 768 hyperparameters, 4 leves of conditionality - \item Based on WEKA and SMAC -\end{itemize} - -\vspace{1cm} -\hspace{12cm}\lit{\href{https://link.springer.com/chapter/10.1007/978-3-030-05318-5_4}{Kotthoff et al. 2019}} - -\end{frame} - -%----------------------------------------------------------------------- \ No newline at end of file diff --git a/w06_hpo_bo/t07_tpe.pdf b/w06_hpo_bo/t07_tpe.pdf new file mode 100644 index 0000000..6a9821a Binary files /dev/null and b/w06_hpo_bo/t07_tpe.pdf differ diff --git a/w06_hpo_bo/t07_tpe.tex b/w06_hpo_bo/t07_tpe.tex new file mode 100644 index 0000000..34a7b85 --- /dev/null +++ b/w06_hpo_bo/t07_tpe.tex @@ -0,0 +1,177 @@ +\videotitle{The Alternative Optimization Approach of the Tree-Parzen Estimator (TPE)} + +%----------------------------------------------------------------------- +\begin{frame}[c]{Overview of TPE \litw{\href{https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf}{Bergstra et al. 
2011}}} + +\begin{itemize} + \item Standard Bayesian optimization models the probability $p(y \mid \conf)$ of observations $y$ given configurations $\conf$ + \item Instead, \alert{TPE fits kernel density estimators (KDEs) $l(\conf \mid y \le \gamma)$ and $g(\conf \mid y > \gamma)$} + \myit{ + \item These KDEs are for ``good configurations'' (leading to objective function values below a threshold $\gamma$) and ``bad configurations'' (all remaining observations) + \item By default, $\gamma$ is set to the 15\% quantile of the observations + } + %\item \emph{Recall.} Bayesian optimization approach: + % \begin{equation*} + % P(\obs \vert \conf) \propto P(\conf \vert \obs) \times P(\obs) + % \end{equation*} + %\item TPE then defines two such distributions, $l$ and $g$: + % \begin{equation*} + % P(\conf \vert \obs) = + % \begin{cases} + % l(\conf) \text{ if } \obs < \obs^*\\ + % g(\conf) \text{ otherwise} + % \end{cases} + % \end{equation*} + %where $\obs^*$ is an empirical threshold for a well-performing configuration (e.g., a $\gamma$ percentile of all observed $\obs$ in $D$). + %\item Distributions are approximated by kernel density estimators (Parzen estimators) +\medskip +\pause + \item Optimizing $l(\conf)/g(\conf)$ is equivalent to optimizing standard expected improvement in Bayesian optimization \lit{\href{https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf}{Bergstra et al. 2011}} + +\medskip +\pause + \item Why is the technique called TPE? + \myit{ + \item The KDEs used are \alert{Parzen estimators} + \item TPE can handle \alert{tree}-structured search spaces + } + +\end{itemize} + + +\end{frame} + +%----------------------------------------------------------------------- +\begin{frame}[c]{TPE Example} + +\onslide<1-> +\begin{figure} + \centering + \only<1>{ + \includegraphics[width=0.6\textwidth]{images/tpe/tpeiter_1_observations.png} + } \only<2>{ + \includegraphics[width=0.6\textwidth]{images/tpe/tpeiter_1_pdfs.png} + }\only<3>{ + \includegraphics[width=0.6\textwidth]{images/tpe/tpeiter_2_observations.png} + }\only<4>{ + \includegraphics[width=0.6\textwidth]{images/tpe/tpeiter_2_pdfs.png} + }\only<5>{ + \includegraphics[width=0.6\textwidth]{images/tpe/tpeiter_3_observations.png} + }\only<6>{\includegraphics[width=0.6\textwidth]{images/tpe/tpeiter_3_pdfs.png} + } +\end{figure} +\centering + +\end{frame} +%----------------------------------------------------------------------- +%----------------------------------------------------------------------- +\begin{frame}[c]{TPE Pseudocode} + + +\begin{center} +\begin{minipage}{0.75\textwidth} +\begin{algorithm}[H] + %\DontPrintSemicolon + \SetAlgoLined + \setcounter{AlgoLine}{0} + \SetKwInOut{Require}{Require} + \SetKwInOut{Result}{Result} + \Require{Search space $\pcs$, + cost function $\cost$, + \textcolor{blue}{percentile $\gamma$}, + maximal number of function evaluations $\bobudget$} + \Result{Best observed configuration $\conf$ according to $\iter[\bobudget]{\dataset}$} + + $\iter[0]{\dataset} \leftarrow \varnothing$\; + + \For{$\bocount=1$ \KwTo $\bobudget$}{ + \textcolor{blue}{$\dataset_\text{good}, \dataset_\text{bad}$ $\leftarrow$ split $\iter[\bocount-1]{\dataset}$} according to quantile $\gamma$\ + + \textcolor{blue}{$l(\conf)$, $g(\conf)$ $\leftarrow$ fit KDE on $\dataset_\text{good}$, $\dataset_\text{bad}$ respectively}\ + + \textcolor{blue}{$\pcs_\text{cand}$ $\leftarrow$ draw samples from $l$};\ + + \textcolor{blue}{Select next query point: $\bonextsample \in \argmax_{\conf \in \pcs_\text{cand}} l(\conf) / g(\conf)$}\ + +
Query $\bonextobs$\; + + $\iter[\bocount]{\dataset} \leftarrow \iter[\bocount-1]{\dataset} \cup \{\langle \bonextsample, \bonextobs \rangle \}$\; + } + \caption*{TPE loop} +\end{algorithm} +\end{minipage} +\end{center} + + +\end{frame} +%----------------------------------------------------------------------- + +\begin{frame}[c]{Further Details} + +Remarks: + +\begin{itemize} + \item TPE models $p(\conf | \obs)$ + \begin{itemize} + \item we can multiply it by a prior to add expert knowledge + \end{itemize} + \smallskip + + \pause + + \item Performance of TPE depends on: + \begin{itemize} + \item setting of $\gamma$ to trade off exploration and exploitation + \item bandwidth of the KDEs + \end{itemize} + + \pause + + \smallskip + + \smallskip + \item A successful tool implementing TPE is Hyperopt \lit{\href{https://github.com/hyperopt/hyperopt}{Bergstra et al.}} +\end{itemize} + +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Summary} + +\begin{columns}[T] % align columns +\begin{column}{.48\textwidth} + \begin{block}{Advantages} + \begin{itemize} + \item Computationally efficient: $O(Nd)$ + \item Parallelizable + \item Robust + \item Can handle complex search spaces with priors + \end{itemize} + \end{block} +\end{column}% + +\hfill% + +\pause + +\begin{column}{.48\textwidth} + \begin{block}{Disadvantages} + \begin{itemize} + \item Less sample-efficient than GPs + \end{itemize} + \end{block} +\end{column} +\end{columns} + +\end{frame} +%----------------------------------------------------------------------- +\begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} + +\begin{itemize} + \item \alert{Discussion.} Is TPE really Bayesian optimization? +\medskip + \item \alert{Discussion.} How does $\gamma$ impact the optimization procedure? +\medskip + \item \alert{Derivation.} Go through the derivation that optimizing $l(\conf) / g(\conf)$ is equivalent to optimizing expected improvement; see Section 4.1 in \lit{\href{https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf}{Bergstra et al. 2011}}. +\end{itemize} + +\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t08_success_stories.pdf b/w06_hpo_bo/t08_success_stories.pdf new file mode 100644 index 0000000..fa194b7 Binary files /dev/null and b/w06_hpo_bo/t08_success_stories.pdf differ diff --git a/w06_hpo_bo/t08_success_stories.tex b/w06_hpo_bo/t08_success_stories.tex new file mode 100644 index 0000000..622d1fe --- /dev/null +++ b/w06_hpo_bo/t08_success_stories.tex @@ -0,0 +1,167 @@ +\videotitle{Success Stories} + +%---------------------------------------------------------------------- +\begin{frame}[c]{Spearmint \litw{\href{https://papers.nips.cc/paper/4522-practical-bayesian-optimization-of-machine-learning-algorithms.pdf}{Snoek et al.
2012}}} + +\small +\begin{itemize} + \item First successful open source Bayesian optimization implementation +% \item Was used to tune a neural network to state-of-the-art performance on CIFAR-10 in 2012 + \item Implements standard Bayesian optimization with MCMC integration of the acquisition function, asynchronous parallelism, + input warping + and constraints + \item \alert{Startup based on Spearmint got acquired by Twitter in 2015} + \item Still heavily used and cited and available at \url{https://github.com/HIPS/spearmint}: + \begin{center} + \only{\includegraphics[width=0.7\linewidth, keepaspectratio=true]{images/success_stories/jsnoek_spearmint_git_stats.png}} + + \only{\includegraphics[width=0.7\linewidth, keepaspectratio=true]{images/success_stories/hips_spearmint_git_stats.png}} + + \only{\includegraphics[width=.5\linewidth, keepaspectratio=true]{images/success_stories/spearmint_alt_stats.png}} +% \newline Google Scholar screenshot from 3rd March, 2020 + \end{center} +\end{itemize} +\end{frame} + +%----------------------------------------------------------------------- +\begin{frame}[c]{Hyperopt \litw{\href{https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf}{Bergstra et al. 2011}, \href{http://proceedings.mlr.press/v28/bergstra13.pdf}{Bergstra et al., 2013}, \href{http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.704.3494&rep=rep1&type=pdf}{Bergstra et al., 2013}, \href{https://iopscience.iop.org/article/10.1088/1749-4699/8/1/014008/ampdf}{Bergstra et al., 2015}}} +\begin{itemize} + \item Hyperopt is another successful open source Bayesian optimization package + \item Implements the TPE algorithm and supports asynchronous parallel evaluations + \item Maintained since 2013 + \item Available at \url{https://github.com/hyperopt/hyperopt} +\end{itemize} +\vspace{1cm} +\includegraphics[width=\linewidth, height=\textheight, keepaspectratio=true]{images/success_stories/hyperopt_git_stats.png} + +\vspace{1cm} +\hspace{2cm} + + +\end{frame} + +%--------------------------------------------------------------------- +\begin{frame}[c]{SMAC \litw{\href{https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf}{Hutter et al. 
2011}}} + +\begin{itemize} + \item Standard BO tool based on random forests (RFs), reflecting the strengths of RFs in terms of \alert{scalability \& flexibility}: + \begin{itemize} + \item High dimensionality (low effective dimensionality) + \item Computational efficiency ($\rightarrow$ low overhead) + \item Supports continuous/categorical/conditional parameters + \item Supports non-standard noise (non-Gaussian, heteroscedastic) + \item Usability off the shelf (robustness towards model's own hyperparameters) + \end{itemize} + +\pause +\smallskip + \item SMAC also handles a more general problem: + $\argmin_{\conf\in\confs} \sum_{i=1}^N \cost(\conf,i)$ +\pause +\smallskip + \item Maintained since 2011, now available in version 3: \url{https://github.com/automl/SMAC3} + +\begin{columns} +\column{0.05\textwidth} +\column{0.45\textwidth} +~\\ +%\vspace*{0.1cm} +\includegraphics[width=1\linewidth, keepaspectratio=true]{images/success_stories/SMAC_citations.png} +\column{0.45\textwidth} +~\\ +~\\ +~\\ +\includegraphics[width=1\linewidth, keepaspectratio=true]{images/success_stories/SMAC_paper.png} +~\\ +\column{0.05\textwidth} +\end{columns} +\end{itemize} +\end{frame} +%---------------------------------------------------------------------- + +%----------------------------------------------------------------------- +\begin{frame}[c]{Tuning AlphaGo \litw{\href{https://arxiv.org/abs/1812.06855}{Chen et al. 2018}}} +\begin{itemize} + \item ``During the development of AlphaGo, \alert{its many hyperparameters were tuned with Bayesian optimization multiple times.}'' +\medskip + \item ``This automatic tuning process resulted in \alert{substantial improvements in playing strength}. For example, prior to the match with Lee Sedol, we tuned the latest AlphaGo agent and this \alert{improved its win-rate from 50\% to 66.5\%} in self-play games. \alert{This tuned version was deployed in the final match.} +\medskip + \item Of course, since we tuned AlphaGo many times during its development cycle, the \alert{compounded contribution was even higher than this percentage.} +\end{itemize} + +\end{frame} + +%----------------------------------------------------------------------- +\begin{frame}[c]{Company usage} +\begin{itemize} + \item SIGOPT: startup offering Bayesian optimization as a service + \item Facebook provides an open source Bayesian optimization package \lit{\href{https://botorch.org/}{BoTorch}} + \item Amazon provides an open source Bayesian optimization package \lit{\href{https://amzn.github.io/emukit/}{EmuKit}} + \item Uber tunes algorithms for \emph{Uber Pool}, \emph{UberX} and \emph{Uber Eats} \lit{\href{http://mcqmc2016.stanford.edu/Frazier-Peter.pdf}{source}} + \item Many more, but less openly +\end{itemize} +\end{frame} + +%----------------------------------------------------------------------- +\begin{frame}[c]{Auto-WEKA \litw{\href{https://dl.acm.org/doi/10.1145/2487575.2487629}{Thornton et al, 2013}, \href{http://www.jmlr.org/papers/volume18/16-261/16-261.pdf}{Kotthoff et al, 2017}, \href{https://link.springer.com/chapter/10.1007/978-3-030-05318-5_4}{Kotthoff et al. 
2019}}} + + \myit{ + \item First \alert{general AutoML system}, carrying out \alert{\textbf{C}ombined \textbf{A}lgorithm \textbf{S}election and \textbf{H}yperparameter optimization} (CASH), jointly optimizing + \begin{itemize} + \item Choice of algorithm (out of 26 classifiers) + \item The algorithm's hyperparameters (up to 10) + \item Choice of preprocessing method and its hyperparameters + \item Choice of ensemble \& meta methods + \end{itemize} + } + +\begin{columns} +\column{0.0\textwidth} +\column{0.55\textwidth} + +\onslide<2-> +\vspace*{-0.4cm} + \myit{ + \item Parameterized WEKA \lit{\href{https://www.cs.waikato.ac.nz/ml/weka/Witten_et_al_2016_appendix.pdf}{Frank et al. 2016}}: \alert{768 hyperparameters}, 4 levels of conditionality + } + \onslide<3->{ + \myit{\item Optimized 10-fold cross-validation via SMAC + \lit{\href{https://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf}{Hutter et al. 2011}} + } + } +\column{0.45\textwidth} +\vspace*{-0.4cm} +\onslide<2->{ +\includegraphics[width=\linewidth, keepaspectratio=true]{w06_hpo_bo/images/success_stories/AutoWEKA_space.png} + } +\end{columns} + +\onslide<4-> + \myit{ + \vspace*{-0.9cm} + \item Results: + \myit{ + \item \alert{Better than an oracle of the 26 base classifiers} with default hyperparameters + \item \alert{100$\times$ faster than grid search} over base classifiers, and still better in 14/21 cases + \item Better than the only other applicable method TPE in \alert{19/21 cases} + } + \item Impact for practitioners: the Auto-WEKA plugin was downloaded tens of thousands of times + } + +\end{frame} + +%----------------------------------------------------------------------- + +%----------------------------------------------------------------------- +\begin{frame}[c]{Questions to Answer for Yourself / Discuss with Friends} + +\begin{itemize} + \item \alert{Repetition.} List several success stories of Bayesian optimization +\medskip + \item \alert{Repetition.} List several prominent tools for Bayesian optimization +\medskip + \item \alert{Discussion.} Recall the algorithm selection problem; how does CASH relate to this (after all, it also has ``algorithm selection'' as part of its name)? + (Hint: they are quite different.) +\end{itemize} + +\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/t99_bibliography.pdf b/w06_hpo_bo/t99_bibliography.pdf new file mode 100644 index 0000000..55dc4b2 Binary files /dev/null and b/w06_hpo_bo/t99_bibliography.pdf differ diff --git a/w06_hpo_bo/t99_bibliography.tex b/w06_hpo_bo/t99_bibliography.tex index 6c35c1f..105c6e3 100644 --- a/w06_hpo_bo/t99_bibliography.tex +++ b/w06_hpo_bo/t99_bibliography.tex @@ -1,3 +1,4 @@ +\videotitle{Further Reading} \begin{frame}[c]{Further Reading} Tutorials on Bayesian Optimization @@ -5,8 +6,10 @@ \item A tutorial on Bayesian Optimization of Expensive Cost Functions, with Application to Active User Modeling and Hierarchical Reinforcement Learning \lit{\href{https://arxiv.org/abs/1012.2599}{Brochu et al. 2010}} \item Taking the Human out of the Loop: A Review of Bayesian Optimization \lit{\href{https://www.cs.princeton.edu/~rpa/pubs/shahriari2016loop.pdf}{Shahriari et al.
2016}} \item A Tutorial on Bayesian Optimization \lit{\href{https://arxiv.org/abs/1807.02811}{Frazier 2018}} - \item Hyperparameter optimization - \lit{\href{https://link.springer.com/chapter/10.1007/978-3-030-05318-5_1}{Feurer and Hutter 2019}} \end{itemize} -\end{frame} +\bigskip + +Survey on hyperparameter optimization: \lit{\href{https://link.springer.com/chapter/10.1007/978-3-030-05318-5_1}{Feurer and Hutter 2019}} + +\end{frame} \ No newline at end of file diff --git a/w06_hpo_bo/title_slide.tex b/w06_hpo_bo/title_slide.tex new file mode 100644 index 0000000..41b194f --- /dev/null +++ b/w06_hpo_bo/title_slide.tex @@ -0,0 +1,11 @@ +%%%%%%%%%%%%%%%%% Title slide -- only change title %%%%%%%%%%%%%%% +%\title{\lecturetitle} +%\subtitle{\weektitle} +%\\\vspace*{0.3cm} +% --------------------------------------------------------------------- +{ +\setbeamertemplate{footline}{} % remove footer on first slide + \frame[c]{ + \titlepage + } +} diff --git a/w06_hpo_bo/w06_hpo_bo.pdf b/w06_hpo_bo/w06_hpo_bo.pdf new file mode 100644 index 0000000..68fb9df Binary files /dev/null and b/w06_hpo_bo/w06_hpo_bo.pdf differ
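A closing aside, appended after the diff: to make the TPE selection rule from t07_tpe.tex concrete, here is a minimal sketch of one iteration. SciPy's gaussian_kde stands in for the Parzen estimators; the 1-D objective, the sample sizes, and the 15% quantile are illustrative assumptions rather than the course implementation.

```python
import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(2)

def cost(x):
    """Toy 1-D objective to be minimized."""
    return np.sin(3 * x) + 0.1 * x ** 2

# Previously evaluated configurations and their observed costs
X = rng.uniform(-3, 3, size=40)
y = cost(X)

gamma = 0.15  # quantile separating "good" from "bad" observations
threshold = np.quantile(y, gamma)
good, bad = X[y <= threshold], X[y > threshold]

# Parzen (kernel density) estimators for good and bad configurations
l, g = gaussian_kde(good), gaussian_kde(bad)

# Draw candidates from l and pick the one maximizing the density ratio l/g,
# which plays the role of the acquisition function in TPE
candidates = l.resample(64, seed=3).ravel()
x_next = candidates[np.argmax(l(candidates) / g(candidates))]

print("next configuration to evaluate:", x_next)
```

Repeating this split / refit / maximize-the-ratio step each iteration yields the loop shown on the TPE pseudocode slide.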