From: François Fleuret Date: Sat, 24 Feb 2024 08:06:51 +0000 (+0100) Subject: Update. X-Git-Url: https://ant.fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=commitdiff_plain;h=119ad14a2072217edf3e2315154614815b72ccbd;p=tex.git Update. --- diff --git a/elbo.tex b/elbo.tex new file mode 100644 index 0000000..6875ddf --- /dev/null +++ b/elbo.tex @@ -0,0 +1,140 @@ +%% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*- + +%% Any copyright is dedicated to the Public Domain. +%% https://creativecommons.org/publicdomain/zero/1.0/ +%% Written by Francois Fleuret + +\documentclass[11pt,a4paper,oneside]{article} +\usepackage[paperheight=15cm,paperwidth=8cm,top=2mm,bottom=15mm,right=2mm,left=2mm]{geometry} +%\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{amsmath,amssymb,dsfont} +\usepackage[pdftex]{graphicx} +\usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue,citecolor=blue]{hyperref} +\usepackage{tikz} +\usetikzlibrary{arrows,arrows.meta,calc} +\usetikzlibrary{patterns,backgrounds} +\usetikzlibrary{positioning,fit} +\usetikzlibrary{shapes.geometric,shapes.multipart} +\usetikzlibrary{patterns.meta,decorations.pathreplacing,calligraphy} +\usetikzlibrary{tikzmark} +\usetikzlibrary{decorations.pathmorphing} +\usepackage[round]{natbib} +\usepackage[osf]{libertine} +\usepackage{microtype} +\usepackage{fancyvrb} + +\usepackage{mleftright} + +\newcommand{\setmuskip}[2]{#1=#2\relax} +\setmuskip{\thinmuskip}{1.5mu} % by default it is equal to 3 mu +\setmuskip{\medmuskip}{2mu} % by default it is equal to 4 mu +\setmuskip{\thickmuskip}{3.5mu} % by default it is equal to 5 mu + +\setlength{\parindent}{0cm} +\setlength{\parskip}{1ex} +%\renewcommand{\baselinestretch}{1.3} +%\setlength{\tabcolsep}{0pt} +%\renewcommand{\arraystretch}{1.0} + +\def\argmax{\operatornamewithlimits{argmax}} +\def\argmin{\operatornamewithlimits{argmin}} + 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\def\given{\,\middle\vert\,} +\def\proba{\operatorname{P}} +\newcommand{\seq}{{S}} +\newcommand{\expect}{\mathds{E}} +\newcommand{\variance}{\mathds{V}} +\newcommand{\empexpect}{\hat{\mathds{E}}} +\newcommand{\mutinf}{\mathds{I}} +\newcommand{\empmutinf}{\hat{\mathds{I}}} +\newcommand{\entropy}{\mathds{H}} +\newcommand{\empentropy}{\hat{\mathds{H}}} +\newcommand{\ganG}{\mathbf{G}} +\newcommand{\ganD}{\mathbf{D}} +\newcommand{\ganF}{\mathbf{F}} + +\newcommand{\dkl}{\mathds{D}_{\mathsf{KL}}} +\newcommand{\djs}{\mathds{D}_{\mathsf{JS}}} + +\allowdisplaybreaks[2] + +\newcommand*{\vertbar}{\rule[-1ex]{0.5pt}{2.5ex}} +\newcommand*{\horzbar}{\rule[.5ex]{2.5ex}{0.5pt}} + +\def\positionalencoding{\operatorname{pos-enc}} +\def\concat{\operatorname{concat}} +\def\crossentropy{\LL_{\operatorname{ce}}} + +\begin{document} + +\vspace*{0ex} + +\begin{center} +{\Large The Evidence Lower Bound} + +Fran\c cois Fleuret + +\today + +\vspace*{1ex} + +\end{center} + +Given a training set $x_1, \dots, x_N$ that follows an unknown +distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ to it, +maximizing +% +\[ +\sum_n \log \, p_\theta(x_n). +\] +% +If we do not have an analytical form of the marginal $p_\theta(x_n)$ +but only the expression of $p_\theta(x_n,z)$, we can get an estimate +of the marginal by sampling $z$ with any distribution $q$ +% +\begin{align*} +p_\theta(x_n) & = \int_z p_\theta(x_n,z) dz \\ + & = \int_z \frac{p_\theta(x_n,z)}{q(z)} q(z) dz \\ + & = \expect_{Z \sim q(z)} \left[\frac{p_\theta(x_n,Z)}{q(Z)}\right]. +\end{align*} +% +So if we wanted to maximize $p_\theta(x_n)$ alone, we could sample a +$Z$ with $q$ and maximize +% +\begin{equation*} +\frac{p_\theta(x_n,Z)}{q(Z)}.\label{eq:estimator} +\end{equation*} + +But we want to maximize $\sum_n \log \, p_\theta(x_n)$. 
If we use the +$\log$ of the previous expression, we can decompose its average value +as +\begin{align*} + & \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(x_n,Z)}{q(Z)} \right] \\ + & = \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(Z \mid x_n) \, p_\theta(x_n)}{q(Z)} \right] \\ + & = \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(Z \mid x_n)}{q(Z)} \right] + \log \, p_\theta(x_n) \\ + & = - \dkl(q(z) \, \| \, p_\theta(z \mid x_n)) + \log \, p_\theta(x_n). +\end{align*} +% +Hence this does not maximize $\log \, p_\theta(x_n)$ on average, but a +\emph{lower bound} of it, since the KL divergence is non-negative. And +since this maximization pushes that KL term down, it also aligns +$p_\theta(z \mid x_n)$ and $q(z)$, and we may get a worse +$p_\theta(x_n)$ to bring $p_\theta(z \mid x_n)$ closer to $q(z)$. + +However, all this analysis is still valid if $q$ is a parameterized +function $q_\alpha(z \mid x_n)$ of $x_n$. In that case, if we optimize +$\theta$ and $\alpha$ to maximize +% +\[ +\expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right], +\] +% +it maximizes $\log \, p_\theta(x_n)$ and brings $q_\alpha(z \mid +x_n)$ close to $p_\theta(z \mid x_n)$. + + +\end{document}