% vim: set sts=4 et :
\documentclass[11pt,xcolor={dvipsnames,svgnames,x11names,table}]{beamer}

\usetheme[
	bullet=circle,		% Other option: square
	bigpagenumber,		% circled page number on lower right
	topline=true,			% colored bar at the top of the frame 
	shadow=false,			% Shading for beamer blocks
	watermark=BG_lower,	% png file for the watermark
	]{Flip}

\usepackage{bm}
\usepackage{tikz,pgfplots,pgfplotstable}
\usepgfplotslibrary{fillbetween}
\usepackage{graphicx}

% SOME COMMANDS THAT I FIND HANDY
% \renewcommand{\tilde}{\widetilde} % dinky tildes look silly, doesn't work with fontspec
\newcommand{\comment}[1]{\textcolor{comment}{\footnotesize{#1}\normalsize}} % comment mild
\newcommand{\Comment}[1]{\textcolor{Comment}{\footnotesize{#1}\normalsize}} % comment bold
\newcommand{\COMMENT}[1]{\textcolor{COMMENT}{\footnotesize{#1}\normalsize}} % comment crazy bold
\newcommand{\Alert}[1]{\textcolor{Alert}{#1}} % louder alert
\newcommand{\ALERT}[1]{\textcolor{ALERT}{#1}} % loudest alert
%% "\alert" is already a beamer pre-defined
\newcommand*{\Scale}[2][4]{\scalebox{#1}{$#2$}}%

\graphicspath{{images/}}	% Put all images in this directory. Avoids clutter.

% suppress frame numbering for backup slides
% you always need the appendix for this!
\newcommand{\backupbegin}{
	\newcounter{framenumberappendix}
	\setcounter{framenumberappendix}{\value{framenumber}}
}
\newcommand{\backupend}{
	\addtocounter{framenumberappendix}{-\value{framenumber}}
	\addtocounter{framenumber}{\value{framenumberappendix}} 
}

\begin{document}
\tikzstyle{every picture}+=[remember picture]

{
\setbeamertemplate{sidebar right}{\llap{\includegraphics[width=\paperwidth,height=\paperheight]{bubble2}}}
\begin{frame}[c]%{\phantom{title page}} 
\begin{center}
\begin{center}
	\begin{columns}
		\begin{column}{0.9\textwidth}
			\flushright%\fontspec{Trebuchet MS}
			\bfseries \Huge {Function Minimization (I)}
		\end{column}
		\begin{column}{0.2\textwidth}
		  %\includegraphics[width=\textwidth]{SHiP-2}
		\end{column}
	\end{columns}
\end{center}
	\quad
	\vspace{3em}
\begin{columns}
\begin{column}{0.6\textwidth}
\flushright \vspace{-1.8em} {%\fontspec{Trebuchet MS}
    \large Marcin Chrzaszcz, Danny van Dyk\\\vspace{-0.1em}\small \href{mailto:mchrzasz@cern.ch}{mchrzasz@cern.ch}, \href{mailto:danny.van.dyk@gmail.com}{danny.van.dyk@gmail.com}}

\end{column}
\begin{column}{0.4\textwidth}
\includegraphics[height=1.3cm]{uzh-transp}
\end{column}
\end{columns}

\vspace{1em}
\vspace{0.5em}
	\textcolor{normal text.fg!50!Comment}{Numerical Methods, \\ 21. November, 2016}
\end{center}
\end{frame}
}

\begin{frame}{Plan for today}
    \begin{itemize}
        \item \alert{What types of function do we usually need to minimize?}\\
            \begin{itemize}
                \item general non-linear problem
                \item least-squares
            \end{itemize}
        \vfill
        \item \alert{What information can we use?}\\
            \begin{itemize}
                \item the target function
                \item its first derivatives
                \item its second derivatives
            \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{General Minimization Problem}
    The general problem can be posited as follows: There exists
    a function
    \begin{equation*}
        f(a_1, \dots, a_N)
    \end{equation*}
    with real-valued parameters $\lbrace a_n \rbrace$.\\
    \vfill
    We intend to find
    \begin{itemize}
        \item \alert{one},
        \item \alert{any}, or
        \item \alert{all}
    \end{itemize}
    of its minima on a support $A$,
    \begin{equation*}
        A = \lbrace (a_1, \dots, a_N)\,|\,g(a_1, \dots, a_N) > 0\rbrace\,.
    \end{equation*}
    In order to find maxima, flip the function around: $f \to -f$.
\end{frame}

\begin{frame}{Example}
    Posterior probability density function with non-Gaussian likelihood.

    \begin{columns}
    \begin{column}{.6\textwidth}
    \includegraphics[height=.8\textheight]{fig-bclvub-comparison-0-1}
    \hfill {\small taken from 1409.7186}
    \end{column}
    \begin{column}{.4\textwidth}
    \begin{itemize}
        \item shallow gradients in one direction
        \item steep gradients in the other
        \item non-symmetric shape
        \item slight ``banana'' qualities\\
            (slightly bent along one axis)
    \end{itemize}
    \end{column}
    \end{columns}
\end{frame}

\begin{frame}{Limitations and Monte-Carlo Methods}
    In general we have no further information on $f$, i.e., we do not
    \emph{analytically} know any of its derivatives.\\
    \vfill
    A very popular way to explore $f$ is by using Monte Carlo methods,
    e.g. plain random walks, Markov chain methods, or genetic algorithms.
    Most of these methods are very good at delineating \emph{local environments}
    around some/most/all(?) of the modes.\\
    \vfill
    As always, analytic knowledge
    of the problem will help. For example, symmetry relations among the parameters
    or (a)periodic boundary conditions should be exploited if possible.\\
    \vfill
    The specific Monte-Carlo methods are beyond the scope of these lectures.
\end{frame}

\begin{frame}{Simplex Method \hfill (aka Nelder-Mead Method)}
    We will now discuss the workhorse of minimization algorithms,
    the \alert{Simplex} method. It has very good convergence
    properties, but a rather slow convergence rate.\\
    \vfill
    The basic idea is as follows:
    \begin{itemize}
        \item<1->[0] A set of parameter points must be provided as an initial
        value. If all else fails, random points can be used.
        \item<2->[1] Order the points by their respective values of $f$.
        \item<3->[2] Compute the midpoint $\vec{a}_\text{mid}$ of all points except the worst, 
        and reflect the worst point $\vec{a}_{N}$ at the midpoint, yielding $\vec{a}_\text{ref}$.
        \begin{itemize}
            \item<4->[a] If $f(\vec{a}_\text{ref})$ fulfills some minimization criteria, replace
            one of the existing simplex points with $\vec{a}_\text{ref}$.
            Continue at step 1.
            \item<5->[b] Otherwise compute $\vec{a}_\text{contr}$ as a linear combination of the worst
            point of the simplex $\vec{a}_N$, and $\vec{a}_\text{ref}$. If $\vec{a}_\text{contr}$
            is better than the worst point, replace the worst point.
            Continue at step 1.
            \item<6->[c] Otherwise compress the simplex by moving the points
            $\vec{a}_1$ to $\vec{a}_N$ closer to $\vec{a}_0$ on their
            respective connecting lines. Continue at step 1.
        \end{itemize}
        \item<7->[3] If at any point the volume of the simplex falls below a given
        threshold, then stop.
    \end{itemize}
\end{frame}

\begin{frame}{Simplex Method (continued)}
    More details on the previous steps:\\
    \vspace{-\medskipamount}
    \begin{overlayarea}{\textwidth}{.95\textheight}
    \begin{itemize}
        \only<1-2>{
        \item[2] Compute the midpoint as
        \begin{equation*}
            \vec{a}_\text{mid} = \frac{1}{N} \sum_{n=0}^{N - 1} \vec{a}_n
        \end{equation*}
        The reflection is computed as\hfill~\alert{[default: $\alpha = 1$]}
        \begin{equation*}
            \vec{a}_\text{ref} = (1 + \alpha) \vec{a}_\text{mid}  - \alpha \vec{a}_N
        \end{equation*}
        }
        \only<2>{
        \item[2a.1] If $\vec{a}_\text{ref}$ is better than $\vec{a}_0$, then replace
        $\vec{a}_N$ with $\vec{a}_\text{ref}$. Continue with step 1.
        \item[2a.1] \alert{alternative}: Compute\hfill~\alert{[default: $\epsilon = 2$]}
        \begin{equation*}
            \vec{a}_\text{exp} = (1 + \epsilon) \vec{a}_\text{mid} - \epsilon \vec{a}_N
        \end{equation*}
        and use in step \alert{2a} the better of the two ($\vec{a}_\text{ref}$ or $\vec{a}_\text{exp}$).
        Continue with step 1.
        }
        \only<3-6>{
        \item[2a.2] If $\vec{a}_\text{ref}$ is better than the second worst point
        $\vec{a}_{N-1}$, then replace the worst point by $\vec{a}_\text{ref}$.
        }
        \only<4-6>{
        \item[2b] Let $\vec{a}_\text{tmp}$ be the better of $\vec{a}_\text{ref}$ and $\vec{a}_N$.
        Compute\hfill~\alert{[default: $\gamma = 1/2$]}
        \begin{equation*}
            \vec{a}_\text{contr} = \gamma \vec{a}_\text{mid} + (1 - \gamma) \vec{a}_\text{tmp}\,.
        \end{equation*}
        If $\vec{a}_\text{contr}$ is better than the worst point, replace the latter. Continue
        at step 1.
        }
        \only<5-6>{
        \item[2c] Compress the entire simplex\hfill~\alert{[default: $\kappa = 1/2$]}
        \begin{equation*}
            \vec{a}_n = \kappa \vec{a}_0 + (1 - \kappa) \vec{a}_n\qquad \forall n=1,\dots,N
        \end{equation*}
        }
        \only<6>{
        \item[0] The $N+1$ initial points must span a simplex. Therefore, care must be
        taken that they do not all lie in a lower-dimensional linear subspace. This is
        similar to requiring that the corners of a 3D volume do not all lie in a single plane.
        }
    \end{itemize}
    \end{overlayarea}
\end{frame}
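
\begin{frame}[fragile]{Simplex Method: Sketch in Code}
    A minimal sketch (in Python/NumPy, not part of the algorithm description above)
    of the steps on the previous two slides, using the expansion alternative of
    step 2a.1; the simplex extent replaces its volume in the stopping criterion.
    \tiny
\begin{verbatim}
import numpy as np

def nelder_mead(f, simplex, alpha=1.0, eps=2.0,
                gamma=0.5, kappa=0.5, tol=1e-8):
    a = np.asarray(simplex, dtype=float)         # N+1 points in N dimensions
    for _ in range(10000):
        a = a[np.argsort([f(p) for p in a])]     # step 1: order by f, best first
        if np.max(np.abs(a[1:] - a[0])) < tol:   # step 3: extent as volume proxy
            break
        mid = np.mean(a[:-1], axis=0)            # step 2: midpoint without worst
        ref = (1 + alpha) * mid - alpha * a[-1]  #         reflect the worst point
        if f(ref) < f(a[0]):                     # step 2a.1: also try expansion
            exp = (1 + eps) * mid - eps * a[-1]
            a[-1] = exp if f(exp) < f(ref) else ref
        elif f(ref) < f(a[-2]):                  # step 2a.2: accept reflection
            a[-1] = ref
        else:
            tmp = ref if f(ref) < f(a[-1]) else a[-1]
            contr = gamma * mid + (1 - gamma) * tmp
            if f(contr) < f(a[-1]):              # step 2b: accept contraction
                a[-1] = contr
            else:                                # step 2c: compress the simplex
                a[1:] = kappa * a[0] + (1 - kappa) * a[1:]
    return min(a, key=f)

# the example of a later slide: f(x, y) = x^2 + y^2
print(nelder_mead(lambda p: p @ p, [[1.0, -1.0], [1.0, 1.0], [2.0, 1.0]]))
\end{verbatim}
\end{frame}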

\begin{frame}{Simplex Method: Properties}
    \begin{itemize}
        \item The method is very robust against problems such as overshooting.
        \item It will usually converge toward a close-by minimum.
        \item However, the convergence rate is lower than for many other methods,
        e.g.\ those tailored to more specialised problems.
        \item No guarantee is given that the simplex method converges toward
        the global minimum. Possible relief comes in the form of several starting
        simplices obtained using Monte-Carlo methods.
    \end{itemize}
\end{frame}

\begin{frame}{Example}
    \begin{equation*}
        f(x, y) = x^2 + y^2
    \end{equation*}
    \begin{columns}[T]
    \begin{column}{.55\textwidth}
    \resizebox{1.10\textwidth}{!}{
    \begin{tikzpicture}
        \begin{axis}[
                xmin=-2,xmax=+2,
                ymin=-2,ymax=+2,
                grid=both,
                axis equal
        ]
            \addplot [domain=0:360,samples=100]
                ({cos(x)}, {sin(x)});
            \addplot [domain=0:360,samples=100]
                ({sqrt(2) * cos(x)}, {sqrt(2) * sin(x)});
            \addplot [domain=0:360,samples=100]
                ({sqrt(3) * cos(x)}, {sqrt(3) * sin(x)});
            \addplot [domain=0:360,samples=100]
                ({sqrt(4) * cos(x)}, {sqrt(4) * sin(x)});
            \addplot [domain=0:360,samples=100]
                ({sqrt(5) * cos(x)}, {sqrt(5) * sin(x)});

            \only<1-2>{
            \draw[red,thick,mark=none,fill=red,fill opacity=0.5]
                (axis cs: +1,-1) -- (axis cs: 1,+1) -- (axis cs: +2,+1) -- cycle;
            }
            \only<2>{
            \addplot[black,mark=x,mark size=3pt,thick] coordinates {
                (+1,+0)
                (+0,-1)
            };
            \addplot[black,mark=none,thick,opacity=0.5] coordinates {
                (+2,+1)
                (+1,+0)
                (+0,-1)
            };
            }
            \only<3-5>{
            \draw[red,thick,mark=none,fill=red,fill opacity=0.5]
                (axis cs: 0, -1) -- (axis cs: +1,-1) -- (axis cs: 1,+1) -- cycle;
            }
            \only<4>{
            \addplot[black,mark=x,mark size=3pt,thick] coordinates {
                (+0.5,-1)
                (+0.0,-3)
            };
            \addplot[black,mark=none,thick,opacity=0.5] coordinates {
                (+1.0,+1)
                (+0.5,-1)
                (+0.0,-3)
            };
            }
            \only<5>{
            \addplot[black,mark=x,mark size=3pt,thick] coordinates {
                (+0.50,-1)
                (+0.75,-0)
            };
            \addplot[black,mark=none,thick,opacity=0.5] coordinates {
                (+1.00,+1)
                (+0.75,-0)
                (+0.50,-1)
            };
            }
            \only<6>{
            \draw[red,thick,mark=none,fill=red,fill opacity=0.5]
                (axis cs: 0.75, 0) -- (axis cs: 0, -1) -- (axis cs: +1,-1) -- cycle;
            }
            \only<7>{
            \draw[red,thick,mark=none,fill=red,fill opacity=0.5]
                (axis cs: -0.3125, 0.25) -- (axis cs: 0.75, 0) -- (axis cs: 0, -1) -- cycle;
            }
        \end{axis}
    \end{tikzpicture}
    }
    \end{column}
    \begin{column}{.45\textwidth}
        \begin{overlayarea}{\textwidth}{.3\textheight}
        \only<1-2>{
        \begin{align*}
            a_0 & = (+1, -1) & f(a_0) & = 2\\
            a_1 & = (+1, +1) & f(a_1) & = 2\\
            \only<2>{\color{red}}
            a_2 & = (+2, +1) & f(a_2) & = 5
        \end{align*}
        }
        \only<3-4>{
        \begin{align*}
            a_0 & = (+0, -1) & f(a_0) & = 1\\
            a_1 & = (+1, -1) & f(a_1) & = 2\\
            \only<4>{\color{red}}
            a_2 & = (+1, +1) & f(a_2) & = 2
        \end{align*}
        }
        \end{overlayarea}
        \begin{overlayarea}{\textwidth}{.3\textheight}
        \only<2>{
        \begin{align*}
            a_\text{mid}
                & = (+1,  0) &             \\
            a_\text{ref}
                & = (0, -1)  & f(a_\text{ref}) & = 1
        \end{align*}
        }
        \only<4>{
        \begin{align*}
            a_\text{mid}
                & = (+1/2,-1) &             \\
            a_\text{ref}
                & = (0, -3)  & f(a_\text{ref}) & = 9
        \end{align*}
        }
        \end{overlayarea}
    \end{column}
    \end{columns}
\end{frame}

\begin{frame}{Least-squares Problems}
    The target function is called the residue $r(a_1, \dots, a_N)$
    with $N$ parameters.
    \begin{equation*}
        r_k \equiv y(\vec{a}, \vec{x}_k) - y_k
    \end{equation*}
    \vfill
    Here $k$ labels one of the $K$ points on a curve,
    with (external, e.g.\ experimental) inputs $(\vec{x}_k, y_k)$.\\
    \vfill
    We now aim to minimize
    \begin{equation*}
        f(\vec{a}) \equiv \sum_{k=1}^{K} |r_k|^2
    \end{equation*}
    \vfill
    The least-squares problem arises from the case of a Gaussian
    likelihood function if all uncertainties are equal. It is a
    very good example for understanding a non-linear problem
    through linearization.
\end{frame}
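
\begin{frame}[fragile]{Least-squares Problems: Residues in Code}
    As a small illustration (not part of the lecture material proper), the residues
    and the target function for the rational model of the later examples can be
    written down directly; the data arrays \texttt{x} and \texttt{y} stand for the
    external inputs $(\vec{x}_k, y_k)$.
    \scriptsize
\begin{verbatim}
import numpy as np

# rational model of the later examples: y(x; a1, a2) = a1*x / (a2 + x)
def model(a, x):
    return a[0] * x / (a[1] + x)

def residuals(a, x, y):
    return model(a, x) - y                 # r_k = y(a, x_k) - y_k

def f(a, x, y):
    return np.sum(residuals(a, x, y)**2)   # f(a) = sum_k |r_k|^2
\end{verbatim}
\end{frame}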

\begin{frame}{Gauss-Newton Method}
    This iterative method requires partial derivatives of the residues
    $r_k$ with respect to the parameters $a_n$:
    \begin{equation*}
        r'_{k,n} \equiv \frac{\partial r(\vec{a}, \vec{x})}{\partial a_n}\Big|_{\vec{x} = \vec{x}_k}
    \end{equation*}
    \vfill
    The algorithm now involves the following quantities:
    \begin{align*}
        J   & = \left(\begin{matrix}
            r'_{1,1} & r'_{1,2} & \dots  & r'_{1,N}\\
            r'_{2,1} & r'_{2,2} & \dots  & r'_{2,N}\\
            \vdots   & \vdots   & \ddots & \vdots  \\
            r'_{K,1} & r'_{K,2} & \dots  & r'_{K,N}
        \end{matrix}\right) &
        \vec{r}
            & = (r_1, \dots, r_K)^T
    \end{align*}
    Here $J$ is the Jacobi matrix of the residue.
\end{frame}

\begin{frame}{Gauss-Newton Method (cont'd)}
    \begin{itemize}
        \item[0] As always, we will be requiring a starting point $\vec{a}_0$ in parameter space.\\
        \item[1] Update the current point:
        \begin{equation*}
            \vec{a}_{i + 1} = \vec{a}_i - (J^T\cdot J)^{-1} \cdot J^T \cdot \vec{r}
        \end{equation*}
        \item[2] If $||\vec{a}_{i + 1} - \vec{a}_i|| < T$, where $T$ is some a-priori threshold,
        we stop. Otherwise, continue with step 1.
    \end{itemize}
    Some comments
    \begin{itemize}
        \item The literature usually recommends computing an auxiliary variable $\vec{s}$ via
        \begin{equation*}
            (J^T \cdot J) \cdot \vec{s} = J^T \cdot \vec{r}\,,
        \end{equation*}
        and then updating $\vec{a}_{i + 1} = \vec{a}_i - \vec{s}$. This linear system of equations can be solved with known methods.
        \item Since $(J^T \cdot J)$ is symmetric, it is a good idea to use Cholesky decomposition.
    \end{itemize}
\end{frame}
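
\begin{frame}[fragile]{Gauss-Newton Method: Sketch in Code}
    A minimal sketch of the iteration in Python/NumPy, solving
    $(J^T \cdot J) \cdot \vec{s} = J^T \cdot \vec{r}$ via Cholesky decomposition as
    recommended above; \texttt{residuals} and \texttt{jacobian} are placeholder
    names for user-supplied functions.
    \scriptsize
\begin{verbatim}
import numpy as np

def gauss_newton(residuals, jacobian, a0, tol=1e-10, max_iter=100):
    a = np.asarray(a0, dtype=float)
    for _ in range(max_iter):
        r = residuals(a)                 # vector of residues r_k
        J = jacobian(a)                  # K x N Jacobi matrix r'_{k,n}
        # solve (J^T J) s = J^T r; J^T J is symmetric, so use Cholesky
        L = np.linalg.cholesky(J.T @ J)
        s = np.linalg.solve(L.T, np.linalg.solve(L, J.T @ r))
        a = a - s                        # step 1: update the current point
        if np.linalg.norm(s) < tol:      # step 2: stop below threshold T
            break
    return a

# usage sketch, given data arrays x and y for the rational model:
#   res = lambda a: a[0]*x / (a[1] + x) - y
#   jac = lambda a: np.column_stack([x/(a[1]+x), -a[0]*x/(a[1]+x)**2])
#   a_fit = gauss_newton(res, jac, a0=[1.0, 1.0])
\end{verbatim}
\end{frame}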

\begin{frame}{Derivation}
    Why does this work? At a (non-degenerate) optimum, the gradient must vanish and the Hessian must be non-singular:
    \begin{equation*}
        g_n \equiv \frac{\partial f(\vec{a})}{\partial a_n} \overset{!}{=} 0 \qquad\text{and}\qquad\det{h} \overset{!}{\neq} 0\qquad\text{with} \quad h_{n,n'} \equiv\frac{\partial^2 f}{\partial a_{n}\,\partial a_{n'}}
    \end{equation*}
    One could therefore use Newton's method to find the zeros of the gradient:
    \begin{equation*}
        \vec{a}_{i + 1} = \vec{a}_i - (h)^{-1} \vec{g}
    \end{equation*}
    Express $\vec{g}$ and $h$ in terms of the $r'_{k,n}$:
    \begin{align*}
        g_n & = \sum_k^{K} 2 \frac{\partial r_k}{\partial a_n} r_k &
        H_{n,n'} & = \sum_k^{K} 2 \frac{\partial^2 r_k}{\partial a_n\,\partial a_{n'}} r_k
                    + 2 \frac{\partial r_k}{\partial a_n} \frac{\partial r_k}{\partial a_{n'}}\\
            & = 2 J \cdot \vec{r} &
            & = {\color{red}\left[\sum_{k}^K 2 \frac{\partial^2 r_k}{\partial a_n\,\partial a_{n'}}\right]} + 2 J^T \cdot J
    \end{align*}
    Assuming that the {\color{red}second-derivative term} is small compared to the squared first-derivative
    term, we can neglect it. Then $h \approx 2\, J^T \cdot J$ and $\vec{g} = 2\, J^T \cdot \vec{r}$,
    and Newton's step reduces to the Gauss-Newton update of the previous slides.
\end{frame}

\begin{frame}{Example}
    \begin{equation*}
        y(x, a_1, a_2) = \frac{a_1 x}{a_2 + x}\qquad\text{7 data points}
    \end{equation*}
    \includegraphics[height=.78\textheight]{fig-gauss-newton.pdf}
\end{frame}

\begin{frame}{Levenberg-Marquardt Method}
    The Levenberg-Marquardt method arises from a modification to the Gauss-Newton method.
    \vfill
    The adjustment length is attenuated through a damping parameter {\color{red}$\lambda$},
    which steers the update away from the Gauss-Newton direction toward the direction of the gradient.
    \vfill
    \begin{itemize}
        \item[0] As always, we will be requiring a starting point $\vec{a}_0$ in parameter space.\\
        \item[1] Update the current point:
        \begin{equation*}
            \vec{a}_{i + 1} = \vec{a}_i - (J^T\cdot J {\color{red}\,+ \lambda I})^{-1} \cdot J^T \cdot \vec{r}
        \end{equation*}
        where $I$ is the $N \times N$ unit matrix {\color{red} and $\lambda$ a real-valued parameter}.
        \item[2] If $||\vec{a}_{i + 1} - \vec{a}_i|| < T$, where $T$ is some a-priori threshold,
        we stop. Otherwise, continue with step 1.
    \end{itemize}

    The optimal choice of {\color{red}$\lambda$} is specific to the problem.
\end{frame}
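
\begin{frame}[fragile]{Levenberg-Marquardt Method: Sketch in Code}
    A minimal sketch of the damped update in Python/NumPy; here {\color{red}$\lambda$}
    is kept fixed for brevity, whereas practical implementations adjust it between
    iterations depending on whether a step actually decreases $f$.
    \scriptsize
\begin{verbatim}
import numpy as np

def levenberg_marquardt(residuals, jacobian, a0,
                        lam=1e-3, tol=1e-10, max_iter=200):
    a = np.asarray(a0, dtype=float)
    for _ in range(max_iter):
        r = residuals(a)
        J = jacobian(a)
        # step 1: solve (J^T J + lambda I) s = J^T r and update the point
        s = np.linalg.solve(J.T @ J + lam * np.eye(len(a)), J.T @ r)
        a = a - s
        if np.linalg.norm(s) < tol:      # step 2: stop below threshold T
            break
    return a
\end{verbatim}
\end{frame}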

\begin{frame}{Example}
    \begin{equation*}
        y(x, a_1, a_2) = \frac{a_1 x}{a_2 + x}\qquad\text{7 data points}
    \end{equation*}
    \includegraphics[height=.78\textheight]{fig-levenberg-marquardt.pdf}
\end{frame}

\backupbegin

\begin{frame}\frametitle{Backup}

\end{frame}

\backupend
\end{document}