% vim: set sts=4 et :
\documentclass[11pt,xcolor={dvipsnames,svgnames,x11names,table}]{beamer}

\usetheme[
    bullet=circle,      % Other option: square
    bigpagenumber,      % circled page number on lower right
    topline=true,       % colored bar at the top of the frame
    shadow=false,       % Shading for beamer blocks
    watermark=BG_lower, % png file for the watermark
]{Flip}

\usepackage{bm}
\usepackage{tikz,pgfplots,pgfplotstable}
\usepgfplotslibrary{fillbetween}
\usepackage{graphicx}

% SOME COMMANDS THAT I FIND HANDY
% \renewcommand{\tilde}{\widetilde} % dinky tildes look silly, doesn't work with fontspec
\newcommand{\comment}[1]{\textcolor{comment}{\footnotesize{#1}\normalsize}} % comment mild
\newcommand{\Comment}[1]{\textcolor{Comment}{\footnotesize{#1}\normalsize}} % comment bold
\newcommand{\COMMENT}[1]{\textcolor{COMMENT}{\footnotesize{#1}\normalsize}} % comment crazy bold
\newcommand{\Alert}[1]{\textcolor{Alert}{#1}} % louder alert
\newcommand{\ALERT}[1]{\textcolor{ALERT}{#1}} % loudest alert
%% "\alert" is already a beamer pre-defined
\newcommand*{\Scale}[2][4]{\scalebox{#1}{$#2$}}

\graphicspath{{images/}} % Put all images in this directory. Avoids clutter.

% suppress frame numbering for backup slides
% you always need the appendix for this!
\newcommand{\backupbegin}{
    \newcounter{framenumberappendix}
    \setcounter{framenumberappendix}{\value{framenumber}}
}
\newcommand{\backupend}{
    \addtocounter{framenumberappendix}{-\value{framenumber}}
    \addtocounter{framenumber}{\value{framenumberappendix}}
}

\begin{document}

\tikzstyle{every picture}+=[remember picture]

{
\setbeamertemplate{sidebar right}{\llap{\includegraphics[width=\paperwidth,height=\paperheight]{bubble2}}}
\begin{frame}[c]%{\phantom{title page}}
    \begin{center}
        \begin{center}
            \begin{columns}
                \begin{column}{0.9\textwidth}
                    \flushright%\fontspec{Trebuchet MS}
                    \bfseries \Huge {Function Minimization (I)}
                \end{column}
                \begin{column}{0.2\textwidth}
                    %\includegraphics[width=\textwidth]{SHiP-2}
                \end{column}
            \end{columns}
        \end{center}
        \quad
        \vspace{3em}
        \begin{columns}
            \begin{column}{0.6\textwidth}
                \flushright \vspace{-1.8em}
                {%\fontspec{Trebuchet MS}
                \large Marcin Chrzaszcz, Danny van Dyk\\\vspace{-0.1em}\small
                \href{mailto:mchrzasz@cern.ch}{mchrzasz@cern.ch},
                \href{mailto:danny.van.dyk@gmail.com}{danny.van.dyk@gmail.com}}
            \end{column}
            \begin{column}{0.4\textwidth}
                \includegraphics[height=1.3cm]{uzh-transp}
            \end{column}
        \end{columns}
        \vspace{1em}
        \vspace{0.5em}
        \textcolor{normal text.fg!50!Comment}{Numerical Methods, \\ 07. November, 2016}
    \end{center}
\end{frame}
}

\begin{frame}{Plan for today}
    \begin{itemize}
        \item \alert{What types of functions do we usually need to minimize?}
        \begin{itemize}
            \item least-squares
            \item likelihood/posterior
            \item general non-linear problems
        \end{itemize}
        \vfill
        \item \alert{What information can we use?}
        \begin{itemize}
            \item the target function
            \item its first derivatives
            \item its second derivatives
        \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{General Minimization Problem}
    The general problem can be posed as follows: There exists a function
    \begin{equation*}
        f(a_1, \dots, a_N)
    \end{equation*}
    with real-valued parameters $\lbrace a_n \rbrace$.\\
    \vfill
    We intend to find
    \begin{itemize}
        \item \alert{one},
        \item \alert{any}, or
        \item \alert{all}
    \end{itemize}
    of its minima on a support $A$,
    \begin{equation*}
        A = \lbrace (a_1, \dots, a_N)\,|\,g(a_1, \dots, a_N) > 0\rbrace\,.
    \end{equation*}
\end{frame}

\begin{frame}{Example}
    A posterior with a non-Gaussian likelihood, e.g.\ an exponential
    likelihood.
\end{frame}
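\begin{frame}[fragile]{Example: Code Sketch}
    A minimal Python sketch of such a target function, assuming an
    exponential likelihood with rate parameter $a$, data
    $\lbrace x_k \rbrace$, and a flat prior on the support $a > 0$
    (all names are illustrative):
    \footnotesize
\begin{verbatim}
import numpy as np

def neg_log_posterior(a, x):
    # illustrative sketch: exponential likelihood, flat prior
    # support A = { a | a > 0 }; reject points outside it
    if a <= 0.0:
        return np.inf
    # likelihood: prod_k a * exp(-a * x_k)
    # => -log posterior = -K log(a) + a * sum_k x_k  (+ const.)
    return -len(x) * np.log(a) + a * np.sum(x)
\end{verbatim}
    \normalsize
    Minimizing this function in $a$ yields the posterior mode.
\end{frame}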
\begin{frame}{Limitations and Monte Carlo Methods}
    In general we have no further information on $f$, i.e., we do not
    \emph{analytically} know any of its derivatives.\\
    \vfill
    A very popular way to explore $f$ is by using Monte Carlo methods,
    e.g.\ plain random walks, Markov chain methods, or genetic algorithms.
    Most of these methods are very good at delineating \emph{local
    environments} around some/most/all(?) of the modes.
    As always, analytic knowledge of the problem will help. For example,
    symmetry relations among the parameters or (a)periodic boundary
    conditions should be exploited if possible.
    The specific Monte Carlo methods are beyond the scope of these lectures.
\end{frame}

\begin{frame}{Simplex Method \hfill (aka Nelder-Mead Method)}
    We will now discuss the work-horse of minimization algorithms, the
    \alert{Simplex} method. It has very good convergence properties, but a
    rather slow convergence rate.\\
    \vfill
    The basic idea is as follows:
    \begin{itemize}
        \item[0] A set of parameter points must be provided as an initial
            value. If all else fails, random points can be used.
        \item[1] Order the points by their respective values of $f$.
        \item[2] Compute the midpoint $\vec{a}_\text{mid}$ of all points
            except the worst, and reflect the worst point $\vec{a}_N$ at the
            midpoint, yielding $\vec{a}_\text{ref}$.
        \begin{itemize}
            \item[a] If $f(\vec{a}_\text{ref})$ fulfills certain acceptance
                criteria (made precise on the next slide), replace one of the
                existing simplex points with $\vec{a}_\text{ref}$. Continue at
                step 1.
            \item[b] Otherwise compute $\vec{a}_\text{contr}$ as a linear
                combination of the worst point of the simplex $\vec{a}_N$ and
                $\vec{a}_\text{ref}$. If $\vec{a}_\text{contr}$ is better than
                the worst point, replace the worst point. Continue at step 1.
            \item[c] Otherwise compress the simplex by moving the points
                $\vec{a}_1$ to $\vec{a}_N$ closer to $\vec{a}_0$ on their
                respective connecting lines. Continue at step 1.
        \end{itemize}
        \item[3] If at any point the volume of the simplex falls below a
            given threshold, then stop.
    \end{itemize}
\end{frame}

\begin{frame}{Simplex Method (continued)}
    More details on the previous steps:\\
    \vspace{-\medskipamount}
    \begin{overlayarea}{\textwidth}{.95\textwidth}
    \begin{itemize}
        \only<1-2>{
        \item[2] Compute the midpoint as
            \begin{equation*}
                \vec{a}_\text{mid} = \frac{1}{N} \sum_{n=0}^{N - 1} \vec{a}_n
            \end{equation*}
            The reflection is computed as\hfill~[default: $\alpha = 1$]
            \begin{equation*}
                \vec{a}_\text{ref} = (1 + \alpha) \vec{a}_\text{mid} - \alpha \vec{a}_N
            \end{equation*}
        }
        \only<2>{
        \item[2a.1] If $\vec{a}_\text{ref}$ is better than $\vec{a}_0$, then
            replace $\vec{a}_N$ with $\vec{a}_\text{ref}$. Continue at step 1.
        \item[2a.1] \alert{alternative}: Compute\hfill~\alert{[default: $\epsilon = 2$]}
            \begin{equation*}
                \vec{a}_\text{exp} = (1 + \epsilon) \vec{a}_\text{mid} - \epsilon \vec{a}_N
            \end{equation*}
            and use in step \alert{2a} the better of the two
            ($\vec{a}_\text{ref}$ or $\vec{a}_\text{exp}$). Continue at step 1.
        }
        \only<3-6>{
        \item[2a.2] If $\vec{a}_\text{ref}$ is better than the second-worst
            point $\vec{a}_{N-1}$, then replace the worst point by
            $\vec{a}_\text{ref}$.
        }
        \only<4-6>{
        \item[2b] Let $\vec{a}_\text{tmp}$ be the better of
            $\vec{a}_\text{ref}$ and $\vec{a}_N$.
            Compute\hfill~\alert{[default: $\gamma = 1/2$]}
            \begin{equation*}
                \vec{a}_\text{contr} = \gamma \vec{a}_\text{mid} + (1 - \gamma) \vec{a}_\text{tmp}\,.
            \end{equation*}
            If $\vec{a}_\text{contr}$ is better than the worst point, replace
            the latter. Continue at step 1.
        }
        \only<5-6>{
        \item[2c] Compress the entire simplex\hfill~\alert{[default: $\kappa = 1/2$]}
            \begin{equation*}
                \vec{a}_n = \kappa \vec{a}_0 + (1 - \kappa) \vec{a}_n \qquad \forall n=1,\dots,N
            \end{equation*}
        }
        \only<6>{
        \item[0] The $N+1$ initial points must span a simplex. Therefore,
            care must be taken that they do not lie in a lower-dimensional
            subspace. This is similar to picking points in a plane when
            constructing a 3D volume.
        }
    \end{itemize}
    \end{overlayarea}
\end{frame}
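\begin{frame}[fragile]{Simplex Method: Code Sketch}
    A compact Python sketch of the steps above. The optional expansion
    step 2a.1 is omitted for brevity; the defaults follow the previous
    slides, and all names are illustrative:
    \footnotesize
\begin{verbatim}
import numpy as np

def nelder_mead(f, pts, alpha=1.0, gamma=0.5, kappa=0.5,
                vol_tol=1e-12, max_iter=10000):
    # illustrative sketch; pts is an (N+1) x N initial simplex
    pts = np.asarray(pts, dtype=float)
    for _ in range(max_iter):
        pts = pts[np.argsort([f(p) for p in pts])]   # step 1
        mid = pts[:-1].mean(axis=0)                  # step 2
        ref = (1 + alpha) * mid - alpha * pts[-1]
        if f(ref) < f(pts[-2]):                      # step 2a.2
            pts[-1] = ref
        else:                                        # step 2b
            tmp = ref if f(ref) < f(pts[-1]) else pts[-1]
            con = gamma * mid + (1 - gamma) * tmp
            if f(con) < f(pts[-1]):
                pts[-1] = con
            else:                                    # step 2c
                pts[1:] = kappa * pts[0] + (1 - kappa) * pts[1:]
        # step 3: stop once the simplex volume is tiny
        if abs(np.linalg.det(pts[1:] - pts[0])) < vol_tol:
            break
    return pts[0]
\end{verbatim}
\end{frame}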
\begin{frame}{Simplex Method: Properties}
    \begin{itemize}
        \item The method is very robust against problems such as
            overshooting.
        \item It will usually converge toward a close-by minimum.
        \item However, the convergence rate is smaller than for many other
            methods, which e.g.\ work on more specialised problems.
        \item No guarantee is given that the simplex method converges toward
            the global minimum. Possible relief comes in the form of several
            starting simplices obtained using Monte Carlo methods.
    \end{itemize}
\end{frame}

\begin{frame}{Example}
\end{frame}

\begin{frame}{Least-squares Problems}
    The least-squares problem arises from the case of a Gaussian likelihood
    function if all uncertainties are equal. It is a very good example for
    understanding a non-linear problem through linearization.\\
    \vfill
    The target function is built from the residuals $r_k$, which depend on
    the $N$ parameters $\vec{a} = (a_1, \dots, a_N)$:
    \begin{equation*}
        r_k \equiv f(\vec{a}, \vec{x}_k) - y_k
    \end{equation*}
    \vfill
    Here $k$ denotes one of the $K$ points on a curve, with (external,
    e.g.\ experimental) inputs $(\vec{x}_k, y_k)$.\\
    \vfill
    The problem now aims to minimize
    \begin{equation*}
        \sum_{k=1}^{K} |r_k|^2
    \end{equation*}
\end{frame}

\begin{frame}{Gauss-Newton Method}
    This iterative method requires partial derivatives of the residuals
    $r_k$ with respect to the parameters $a_n$:
    \begin{equation*}
        r'_{k,n} \equiv \frac{\partial r(\vec{a}, \vec{x})}{\partial a_n}\Big|_{\vec{x} = \vec{x}_k}
    \end{equation*}
    \vfill
    The algorithm now involves the following quantities:
    \begin{align*}
        D & = \left(\begin{matrix}
            r'_{1,1} & r'_{1,2} & \dots  & r'_{1,N} \\
            r'_{2,1} & r'_{2,2} & \dots  & r'_{2,N} \\
            \vdots   & \vdots   & \ddots & \vdots   \\
            r'_{K,1} & r'_{K,2} & \dots  & r'_{K,N}
        \end{matrix}\right)
        &
        \vec{r} & = (r_1, \dots, r_K)^T
    \end{align*}
\end{frame}

\begin{frame}{Gauss-Newton Method (cont'd)}
    \begin{itemize}
        \item[0] As always, we will be requiring a starting point
            $\vec{a}_0$ in parameter space.
        \item[1] Update the current point:
            \begin{equation*}
                \vec{a}_{i + 1} = \vec{a}_i - (D^T\cdot D)^{-1} \cdot D^T \cdot \vec{r}
            \end{equation*}
        \item[2] If $||\vec{a}_{i + 1} - \vec{a}_i|| < T$, where $T$ is some
            a priori threshold, we stop. Otherwise, continue with step 1.
    \end{itemize}
    Some comments:
    \begin{itemize}
        \item The literature usually recommends computing the auxiliary
            variable $\vec{s}$ via
            \begin{equation*}
                (D^T \cdot D) \cdot \vec{s} = D^T \cdot \vec{r}
            \end{equation*}
            and then updating $\vec{a}_{i+1} = \vec{a}_i - \vec{s}$. The
            above linear system of equations can be solved with known
            methods.
        \item Since $(D^T \cdot D)$ is symmetric (and positive definite for
            full-rank $D$), it is a good idea to use a Cholesky
            decomposition.
    \end{itemize}
\end{frame}

\begin{frame}{Derivation}
    \url{https://en.wikipedia.org/wiki/Gauss-Newton_algorithm}
\end{frame}

\begin{frame}{Example}
    \url{https://en.wikipedia.org/wiki/Gauss-Newton_algorithm}
\end{frame}

\begin{frame}{Levenberg-Marquardt Method}
    The Levenberg-Marquardt method arises from a modification to the
    Gauss-Newton method; it can be viewed as a trust-region variant.
    The idea is to dampen the update:
    \begin{itemize}
        \item[0] As always, we will be requiring a starting point
            $\vec{a}_0$ in parameter space.
        \item[1] Update the current point:
            \begin{equation*}
                \vec{a}_{i + 1} = \vec{a}_i - (D^T\cdot D {\color{red}\,+ \lambda I})^{-1} \cdot D^T \cdot \vec{r}
            \end{equation*}
            where $I$ is the $N \times N$ unit matrix {\color{red} and
            $\lambda$ a real-valued parameter}.
        \item[2] If $||\vec{a}_{i + 1} - \vec{a}_i|| < T$, where $T$ is some
            a priori threshold, we stop. Otherwise, continue with step 1.
    \end{itemize}
    The optimal choice of {\color{red}$\lambda$} is specific to the problem.
\end{frame}
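\begin{frame}[fragile]{Gauss-Newton / Levenberg-Marquardt: Code Sketch}
    A minimal Python sketch of the damped update; $\lambda = 0$ recovers
    the plain Gauss-Newton step. The callables \texttt{r} (residual
    vector) and \texttt{D} (its Jacobian) are assumed to be user-supplied,
    and all names are illustrative:
    \footnotesize
\begin{verbatim}
import numpy as np

def minimize(a0, r, D, lam=0.0, tol=1e-8, max_iter=100):
    # illustrative sketch; r(a) and D(a) are user-supplied
    a = np.asarray(a0, dtype=float)
    for _ in range(max_iter):
        Dk = D(a)
        # solve (D^T D + lam I) s = D^T r for s rather than
        # inverting; the matrix is symmetric, so a Cholesky
        # solve would be the natural choice here
        lhs = Dk.T @ Dk + lam * np.eye(len(a))
        s = np.linalg.solve(lhs, Dk.T @ r(a))
        a_new = a - s                        # step 1
        if np.linalg.norm(a_new - a) < tol:  # step 2
            return a_new
        a = a_new
    return a
\end{verbatim}
\end{frame}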
\begin{frame}
\end{frame}

\backupbegin

\begin{frame}\frametitle{Backup}
\end{frame}

\backupend

\end{document}