% vim: set sts=4 et :
\documentclass[11pt,xcolor={dvipsnames,svgnames,x11names,table}]{beamer}

\usetheme[
	bullet=circle,		% Other option: square
	bigpagenumber,		% circled page number on lower right
	topline=true,			% colored bar at the top of the frame 
	shadow=false,			% Shading for beamer blocks
	watermark=BG_lower,	% png file for the watermark
	]{Flip}

\usepackage{bm}
\usepackage{tikz,pgfplots,pgfplotstable}
\usepgfplotslibrary{fillbetween}
\usepackage{graphicx}

% SOME COMMANDS THAT I FIND HANDY
% \renewcommand{\tilde}{\widetilde} % dinky tildes look silly, doesn't work with fontspec
\newcommand{\comment}[1]{\textcolor{comment}{\footnotesize{#1}\normalsize}} % comment mild
\newcommand{\Comment}[1]{\textcolor{Comment}{\footnotesize{#1}\normalsize}} % comment bold
\newcommand{\COMMENT}[1]{\textcolor{COMMENT}{\footnotesize{#1}\normalsize}} % comment crazy bold
\newcommand{\Alert}[1]{\textcolor{Alert}{#1}} % louder alert
\newcommand{\ALERT}[1]{\textcolor{ALERT}{#1}} % loudest alert
%% "\alert" is already a beamer pre-defined
\newcommand*{\Scale}[2][4]{\scalebox{#1}{$#2$}}%

\graphicspath{{images/}}	% Put all images in this directory. Avoids clutter.

% suppress frame numbering for backup slides
% you always need the appendix for this!
\newcommand{\backupbegin}{
	\newcounter{framenumberappendix}
	\setcounter{framenumberappendix}{\value{framenumber}}
}
\newcommand{\backupend}{
	\addtocounter{framenumberappendix}{-\value{framenumber}}
	\addtocounter{framenumber}{\value{framenumberappendix}} 
}

\begin{document}
\tikzstyle{every picture}+=[remember picture]

{
\setbeamertemplate{sidebar right}{\llap{\includegraphics[width=\paperwidth,height=\paperheight]{bubble2}}}
\begin{frame}[c]%{\phantom{title page}} 
\begin{center}
	\begin{columns}
		\begin{column}{0.9\textwidth}
			\flushright%\fontspec{Trebuchet MS}
			\bfseries \Huge {Function Minimization (I)}
		\end{column}
		\begin{column}{0.2\textwidth}
		  %\includegraphics[width=\textwidth]{SHiP-2}
		\end{column}
	\end{columns}
	\quad
	\vspace{3em}
\begin{columns}
\begin{column}{0.6\textwidth}
\flushright \vspace{-1.8em} {%\fontspec{Trebuchet MS}
    \large Marcin Chrzaszcz, Danny van Dyk\\\vspace{-0.1em}\small \href{mailto:mchrzasz@cern.ch}{mchrzasz@cern.ch}, \href{mailto:danny.van.dyk@gmail.com}{danny.van.dyk@gmail.com}}

\end{column}
\begin{column}{0.4\textwidth}
\includegraphics[height=1.3cm]{uzh-transp}
\end{column}
\end{columns}

\vspace{1em}
\vspace{0.5em}
	\textcolor{normal text.fg!50!Comment}{Numerical Methods, \\ 7 November 2016}
\end{center}
\end{frame}
}

\begin{frame}{Plan for today}
    \begin{itemize}
        \item \alert{What types of functions do we usually need to minimize?}\\
            \begin{itemize}
                \item least-squares
                \item likelihood/posterior
                \item general non-linear problem
            \end{itemize}
        \vfill
        \item \alert{What information can we use?}\\
            \begin{itemize}
                \item the target function
                \item its first derivatives
                \item its second derivatives
            \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{General Minimization Problem}
    The general problem can be posed as follows: there exists
    a function
    \begin{equation*}
        f(a_1, \dots, a_N)
    \end{equation*}
    with real-valued parameters $\lbrace a_n \rbrace$.\\
    \vfill
    We intend to find
    \begin{itemize}
        \item \alert{one},
        \item \alert{any}, or
        \item \alert{all}
    \end{itemize}
    of its minima on a support $A$,
    \begin{equation*}
        A = \lbrace (a_1, \dots, a_N)\,|\,g(a_1, \dots, a_N) > 0\rbrace\,.
    \end{equation*}
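    For example, choosing $g(a_1, \dots, a_N) = 1 - \sum_{n=1}^{N} a_n^2$
    restricts the search to the interior of the unit ball.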
\end{frame}

\begin{frame}{Example}
    A posterior with a non-Gaussian likelihood; for example, an exponential likelihood.
\end{frame}

\begin{frame}{Limitations and Monte-Carlo Methods}
    In general we have no further information on $f$, i.e., we do not
    \emph{analytically} know any of its derivatives.\\
    \vfill
    A very popular way to explore $f$ is to use Monte Carlo methods,
    e.g. plain random walks, Markov chain methods, or genetic algorithms.
    Most of these methods are very good at delineating the \emph{local
    environments} around some, most, or ideally all of the modes. As always,
    analytic knowledge of the problem will help. For example, symmetry
    relations among the parameters or (a)periodic boundary conditions should
    be exploited if possible.\\
    \vfill
    The specific Monte Carlo methods are beyond the scope of these lectures.
\end{frame}

\begin{frame}{Simplex Method \hfill (aka Nelder-Mead Method)}
    We will now discuss the work-horse of minimization algorithms,
    the \alert{Simplex} method. It has very good convergence
    properties, but a rather slow convergence rate.\\
    \vfill
    The basic idea is as follows:
    \begin{itemize}
        \item[0] A set of parameter points must be provided as an initial
        value. If all else fails, random points can be used.
        \item[1] Order the points by their respective values of $f$.
        \item[2] Compute the midpoint $\vec{a}_\text{mid}$ of all points except the worst,
        and reflect the worst point $\vec{a}_{N}$ at the midpoint, yielding $\vec{a}_\text{ref}$.
        \begin{itemize}
            \item[a] If $f(\vec{a}_\text{ref})$ fulfills some minimization criteria, replace
            one of the existing simplex points with $\vec{a}_\text{ref}$.
            Continue at step 1.
            \item[b] Otherwise compute $\vec{a}_\text{contr}$ as a linear combination of the worst
            point of the simplex, $\vec{a}_N$, and $\vec{a}_\text{ref}$. If $\vec{a}_\text{contr}$
            is better than the worst point, replace the worst point.
            Continue at step 1.
            \item[c] Otherwise compress the simplex by moving the points
            $\vec{a}_1$ to $\vec{a}_N$ closer to $\vec{a}_0$ on their
            respective connecting lines. Continue at step 1.
        \end{itemize}
        \item[3] If at any point the volume of the simplex falls below a given
        threshold, then stop.
    \end{itemize}
\end{frame}

\begin{frame}{Simplex Method (continued)}
    More details on the previous steps:\\
    \vspace{-\medskipamount}
    \begin{overlayarea}{\textwidth}{.95\textwidth}
    \begin{itemize}
        \only<1-2>{
        \item[2] Compute the midpoint as
        \begin{equation*}
            \vec{a}_\text{mid} = \frac{1}{N} \sum_{n=0}^{N - 1} \vec{a}_n
        \end{equation*}
        The reflection is computed as\hfill~[default: $\alpha = 1$]
        \begin{equation*}
            \vec{a}_\text{ref} = (1 + \alpha) \vec{a}_\text{mid}  - \alpha \vec{a}_N
        \end{equation*}
        }
        \only<2>{
        \item[2a.1] If $\vec{a}_\text{ref}$ is better than $\vec{a}_0$, then replace
        $\vec{a}_N$ with $\vec{a}_\text{ref}$. Continue with step 1.
        \item[2a.1] \alert{alternative}: Compute\hfill~\alert{[default: $\epsilon = 2$]}
        \begin{equation*}
            \vec{a}_\text{exp} = (1 + \epsilon) \vec{a}_\text{mid} - \epsilon \vec{a}_N
        \end{equation*}
        and use in step \alert{2a} the better of the two ($\vec{a}_\text{ref}$ or $\vec{a}_\text{exp}$).
        Continue with step 1.
        }
        \only<3-6>{
        \item[2a.2] If $\vec{a}_\text{ref}$ is better than the second worst point
        $\vec{a}_{N-1}$, then replace the worst point by $\vec{a}_\text{ref}$.
        }
        \only<4-6>{
        \item[2b] Let $\vec{a}_\text{tmp}$ be the better of $\vec{a}_\text{ref}$ and $\vec{a}_N$.
        Compute\hfill~\alert{[default: $\gamma = 1/2$]}
        \begin{equation*}
            \vec{a}_\text{contr} = \gamma \vec{a}_\text{mid} + (1 - \gamma) \vec{a}_\text{tmp}\,.
        \end{equation*}
        If $\vec{a}_\text{contr}$ is better than the worst point, replace the latter. Continue
        at step 1.
        }
        \only<5-6>{
        \item[2c] Compress the entire simplex\hfill~\alert{[default: $\sigma = 1/2$]}
        \begin{equation*}
            \vec{a}_n = \sigma \vec{a}_0 + (1 - \sigma) \vec{a}_n\qquad \forall n=1,\dots,N
        \end{equation*}
        }
        \only<6>{
        \item[0] The $N+1$ initial points must span a simplex. Therefore, care must be
        taken that they do not all lie in a lower-dimensional linear subspace. This is
        similar to trying to construct a 3D volume from points that all lie in a plane.
        }
    \end{itemize}
    \end{overlayarea}
\end{frame}
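
\begin{frame}[fragile]{Simplex Method: Sketch in Python}
    A minimal sketch of the above steps in Python, with the default
    parameters $\alpha = 1$, $\epsilon = 2$, $\gamma = 1/2$, $\sigma = 1/2$
    hard-coded. Function names, the stopping rule, and the usage example
    are illustrative choices, not fixed by the algorithm:
    {\footnotesize
\begin{verbatim}
import numpy as np

def nelder_mead(f, points, tol=1e-10, max_iter=1000):
    # points: (N+1) x N array spanning a non-degenerate simplex
    simplex = np.array(points, dtype=float)
    for _ in range(max_iter):
        # step 1: order the vertices by their values of f
        simplex = simplex[np.argsort([f(p) for p in simplex])]
        # step 3: stop once the simplex has become small enough
        if np.max(np.abs(simplex[1:] - simplex[0])) < tol:
            break
        mid = simplex[:-1].mean(axis=0)   # midpoint w/o worst point
        ref = 2.0 * mid - simplex[-1]     # reflection, alpha = 1
        if f(ref) < f(simplex[0]):        # 2a.1: better than best
            exp = 3.0 * mid - 2.0 * simplex[-1]  # expansion, eps = 2
            simplex[-1] = exp if f(exp) < f(ref) else ref
        elif f(ref) < f(simplex[-2]):     # 2a.2: beats second worst
            simplex[-1] = ref
        else:
            tmp = ref if f(ref) < f(simplex[-1]) else simplex[-1]
            contr = 0.5 * (mid + tmp)     # 2b: contraction, gamma = 1/2
            if f(contr) < f(simplex[-1]):
                simplex[-1] = contr
            else:                         # 2c: compression, sigma = 1/2
                simplex[1:] = 0.5 * (simplex[0] + simplex[1:])
    return simplex[0]

# usage: minimize (a_1 - 1)^2 + (a_2 + 2)^2 from a unit simplex
# nelder_mead(lambda a: (a[0] - 1)**2 + (a[1] + 2)**2,
#             [[0., 0.], [1., 0.], [0., 1.]])
\end{verbatim}
    }
\end{frame}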

\begin{frame}{Simplex Method: Properties}
    \begin{itemize}
        \item The method is very robust against problems such as overshooting.
        \item It will usually converge toward a close-by minimum.
        \item However, the convergence rate is smaller than for many other methods,
        which are, e.g., tailored to more specialised problems.
        \item No guarantee is given that the simplex method converges toward
        the global minimum. Possible relief comes in the form of several starting
        simplices obtained using Monte Carlo methods.
    \end{itemize}
\end{frame}

\begin{frame}{Example}
\end{frame}

\begin{frame}{Least-squares Problems}
    The least-squares problem arises from the case of a Gaussian
    likelihood function if all uncertainties are equal. It is a
    very good example for understanding a non-linear problem
    through linearization.\\
    \vfill
    The target function is built from the residuals $r_k(a_1, \dots, a_N)$
    with $N$ parameters,
    \begin{equation*}
        r_k \equiv f(\vec{a}, \vec{x}_k) - y_k\,.
    \end{equation*}
    \vfill
    Here $k$ denotes one of the $K$ points on a curve,
    with (external, e.g. experimental) inputs $(\vec{x}_k, y_k)$.\\
    \vfill
    The problem now aims to minimize
    \begin{equation*}
        \sum_{k=1}^{K} |r_k|^2\,.
    \end{equation*}
\end{frame}
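
\begin{frame}{Least-squares Problems: Example}
    A simple illustration: a straight-line fit with
    \begin{equation*}
        f(\vec{a}, x) = a_1 + a_2 x\,, \qquad r_k = a_1 + a_2 x_k - y_k\,.
    \end{equation*}
    Minimizing $\sum_{k=1}^{K} |r_k|^2$ with respect to $a_1$ and $a_2$
    reproduces ordinary linear regression, which has a closed-form
    solution. For a model $f$ that is non-linear in $\vec{a}$ no closed
    form exists in general, which motivates the iterative methods that
    follow.
\end{frame}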

\begin{frame}{Gauss-Newton Method}
    This iterative method requires the partial derivatives of the residuals
    $r_k$ with respect to the parameters $a_n$:
    \begin{equation*}
        r'_{k,n} \equiv \frac{\partial r(\vec{a}, \vec{x})}{\partial a_n}\Big|_{\vec{x} = \vec{x}_k}
    \end{equation*}
    \vfill
    The algorithm now involves the following quantities:
    \begin{align*}
        D   & = \left(\begin{matrix}
            r'_{1,1} & r'_{1,2} & \dots  & r'_{1,N}\\
            r'_{2,1} & r'_{2,2} & \dots  & r'_{2,N}\\
            \vdots   & \vdots   & \ddots & \vdots  \\
            r'_{K,1} & r'_{K,2} & \dots  & r'_{K,N}
        \end{matrix}\right) &
        \vec{r}
            & = (r_1, \dots, r_K)^T
    \end{align*}
\end{frame}

\begin{frame}{Gauss-Newton Method (cont'd)}
    \begin{itemize}
        \item[0] As always, we require a starting point $\vec{a}_0$ in parameter space.\\
        \item[1] Update the current point:
        \begin{equation*}
            \vec{a}_{i + 1} = \vec{a}_i - (D^T\cdot D)^{-1} \cdot D^T \cdot \vec{r}
        \end{equation*}
        \item[2] If $||\vec{a}_{i + 1} - \vec{a}_i|| < T$, where $T$ is some a-priori threshold,
        we stop. Otherwise, continue with step 1.
    \end{itemize}
    Some comments:
    \begin{itemize}
        \item The literature usually recommends computing the auxiliary variable $\vec{s}$ via:
        \begin{equation*}
            (D^T \cdot D) \cdot \vec{s} = D^T \cdot \vec{r}
        \end{equation*}
        The above linear system of equations can be solved with known methods;
        the update then reads $\vec{a}_{i+1} = \vec{a}_i - \vec{s}$.
        \item Since $(D^T \cdot D)$ is symmetric, it is a good idea to use Cholesky decomposition.
    \end{itemize}
\end{frame}
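
\begin{frame}[fragile]{Gauss-Newton Method: Sketch in Python}
    A minimal sketch of the iteration via the normal equations. The
    interface (callables \texttt{residual} and \texttt{jacobian}) is an
    illustrative choice; a production code would use a Cholesky solver
    for the symmetric matrix $D^T \cdot D$ as noted above:
    {\footnotesize
\begin{verbatim}
import numpy as np

def gauss_newton(residual, jacobian, a0, tol=1e-10, max_iter=100):
    # residual(a) -> vector r of shape (K,)
    # jacobian(a) -> matrix D of shape (K, N)
    a = np.array(a0, dtype=float)
    for _ in range(max_iter):
        r = residual(a)
        D = jacobian(a)
        # solve the normal equations (D^T D) s = D^T r for the step s
        s = np.linalg.solve(D.T @ D, D.T @ r)
        a = a - s
        if np.linalg.norm(s) < tol:  # ||a_{i+1} - a_i|| < T
            break
    return a
\end{verbatim}
    }
\end{frame}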

\begin{frame}{Derivation}
    \url{https://en.wikipedia.org/wiki/Gauss-Newton_algorithm}
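    \vfill
    Sketch of the standard argument: linearize the residuals around the
    current point,
    \begin{equation*}
        \vec{r}(\vec{a}_i - \vec{s}) \approx \vec{r}(\vec{a}_i) - D \cdot \vec{s}\,,
    \end{equation*}
    and minimize $||\vec{r} - D \cdot \vec{s}||^2$ with respect to the step $\vec{s}$.
    Setting the gradient to zero yields the normal equations
    \begin{equation*}
        (D^T \cdot D) \cdot \vec{s} = D^T \cdot \vec{r}\,,
    \end{equation*}
    i.e., exactly the system quoted for the Gauss-Newton update.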
\end{frame}

\begin{frame}{Example}
    \url{https://en.wikipedia.org/wiki/Gauss-Newton_algorithm}
\end{frame}

\begin{frame}{Levenberg-Marquardt Method}
    The Levenberg-Marquardt method arises from a modification to the
    Gauss-Newton method: the update step is damped, which effectively
    confines it to a \emph{trust region} around the current point.
    The damped iteration reads:
    \begin{itemize}
        \item[0] As always, we require a starting point $\vec{a}_0$ in parameter space.\\
        \item[1] Update the current point:
        \begin{equation*}
            \vec{a}_{i + 1} = \vec{a}_i - (D^T\cdot D {\color{red}\,+ \lambda I})^{-1} \cdot D^T \cdot \vec{r}
        \end{equation*}
        where $I$ is the $N \times N$ unit matrix {\color{red} and $\lambda$ a real-valued parameter}.
        \item[2] If $||\vec{a}_{i + 1} - \vec{a}_i|| < T$, where $T$ is some a-priori threshold,
        we stop. Otherwise, continue with step 1.
    \end{itemize}

    The optimal choice of {\color{red}$\lambda$} is specific to the problem.
\end{frame}
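
\begin{frame}[fragile]{Levenberg-Marquardt: Sketch in Python}
    A minimal sketch with the same interface as the Gauss-Newton sketch.
    The $\lambda$ update heuristic (halve on success, double on failure)
    is one common, illustrative choice; it is not prescribed by the
    method itself:
    {\footnotesize
\begin{verbatim}
import numpy as np

def levenberg_marquardt(residual, jacobian, a0, lam=1e-3,
                        tol=1e-10, max_iter=100):
    a = np.array(a0, dtype=float)
    for _ in range(max_iter):
        r = residual(a)
        D = jacobian(a)
        # damped normal equations: (D^T D + lambda I) s = D^T r
        s = np.linalg.solve(D.T @ D + lam * np.eye(a.size), D.T @ r)
        if np.sum(residual(a - s)**2) < np.sum(r**2):
            a = a - s     # accept the step, trust the model more
            lam *= 0.5
        else:
            lam *= 2.0    # reject the step, shrink the trust region
        if np.linalg.norm(s) < tol:
            break
    return a
\end{verbatim}
    }
\end{frame}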


\backupbegin

\begin{frame}\frametitle{Backup}

\end{frame}

\backupend
\end{document}