\documentclass[11 pt,xcolor={dvipsnames,svgnames,x11names,table}]{beamer}

\usepackage[english]{babel} 
\usepackage{polski}         


\usetheme[
	bullet=circle,		% Other option: square
	bigpagenumber,		% circled page number on lower right
	topline=true,			% colored bar at the top of the frame 
	shadow=false,			% Shading for beamer blocks
	watermark=BG_lower,	% png file for the watermark
	]{Flip}

%\logo{\kern+1.em\includegraphics[height=1cm]{SHiP-3_LightCharcoal}}
                            

\usepackage[lf]{berenis}
\usepackage[LY1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{cases}
\usepackage{mathtools}
\usepackage{emerald}
\usefonttheme{professionalfonts}
\usepackage[no-math]{fontspec}		
\defaultfontfeatures{Mapping=tex-text}	% This seems to be important for mapping glyphs properly

\setmainfont{Gillius ADF}			% Beamer ignores "main font" in favor of sans font
\setsansfont{Gillius ADF}			% This is the font that beamer will use by default
% \setmainfont{Gill Sans Light}		% Prettier, but harder to read

\setbeamerfont{title}{family=\fontspec{Gillius ADF}}

\input t1augie.fd

%\newcommand{\handwriting}{\fontspec{augie}} % From Emerald City, free font
%\newcommand{\handwriting}{\usefont{T1}{fau}{m}{n}} % From Emerald City, free font
% \newcommand{\handwriting}{}	% If you prefer no special handwriting font or don't have augie

%% Gill Sans doesn't look very nice when boldfaced
%% This is a hack to use Helvetica instead
%% Usage: \textbf{\forbold some stuff}
%\newcommand{\forbold}{\fontspec{Arial}}

\usepackage{graphicx}
\usepackage[export]{adjustbox}

\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{bm}
\usepackage{colortbl}
\usepackage{mathrsfs} 			% For Weinberg-esque letters
\usepackage{cancel}				% For "SUSY-breaking" symbol
\usepackage{slashed}            % for slashed characters in math mode
\usepackage{bbm}                % for \mathbbm{1} (unit matrix)
\usepackage{amsthm}				% For theorem environment
\usepackage{multirow}			% For multi row cells in table
\usepackage{arydshln} 			% For dashed lines in arrays and tables
\usepackage{siunitx}
\usepackage{xhfill}
\usepackage{grffile}
\usepackage{textpos}
\usepackage{subfigure}
\usepackage{tikz}
\usepackage{xspace}             % for \xspace used in \mc and \ARROW below
\usepackage{hyperref}
%\usepackage{hepparticles}    
\usepackage[italic]{hepparticles}     

\usepackage{hepnicenames} 

% Drawing a line
\tikzstyle{lw} = [line width=20pt]
\newcommand{\topline}{%
  \tikz[remember picture,overlay] {%
    \draw[crimsonred] ([yshift=-23.5pt]current page.north west)
             -- ([yshift=-23.5pt,xshift=\paperwidth]current page.north west);}}



% % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % % %
\usepackage{tikzfeynman}		% For Feynman diagrams
\usetikzlibrary{arrows,shapes}
\usetikzlibrary{trees}
\usetikzlibrary{matrix,arrows} 				% For commutative diagram
% http://www.felixl.de/commu.pdf
\usetikzlibrary{positioning}				% For "above of=" commands
\usetikzlibrary{calc,through}				% For coordinates
\usetikzlibrary{decorations.pathreplacing}  % For curly braces
% http://www.math.ucla.edu/~getreuer/tikz.html
\usepackage{pgffor}							% For repeating patterns

\usetikzlibrary{decorations.pathmorphing}	% For Feynman Diagrams
\usetikzlibrary{decorations.markings}
\tikzset{
	% >=stealth', %%  Uncomment for more conventional arrows
	vector/.style={decorate, decoration={snake}, draw},
	provector/.style={decorate, decoration={snake,amplitude=2.5pt}, draw},
	antivector/.style={decorate, decoration={snake,amplitude=-2.5pt}, draw},
	fermion/.style={draw=gray, postaction={decorate},
		decoration={markings,mark=at position .55 with {\arrow[draw=gray]{>}}}},
	fermionbar/.style={draw=gray, postaction={decorate},
		decoration={markings,mark=at position .55 with {\arrow[draw=gray]{<}}}},
	fermionnoarrow/.style={draw=gray},
	gluon/.style={decorate, draw=black,
		decoration={coil,amplitude=4pt, segment length=5pt}},
	scalar/.style={dashed,draw=black, postaction={decorate},
		decoration={markings,mark=at position .55 with {\arrow[draw=black]{>}}}},
	scalarbar/.style={dashed,draw=black, postaction={decorate},
		decoration={markings,mark=at position .55 with {\arrow[draw=black]{<}}}},
	scalarnoarrow/.style={dashed,draw=black},
	electron/.style={draw=black, postaction={decorate},
		decoration={markings,mark=at position .55 with {\arrow[draw=black]{>}}}},
	bigvector/.style={decorate, decoration={snake,amplitude=4pt}, draw},
}

% TIKZ - for block diagrams, 
% from http://www.texample.net/tikz/examples/control-system-principles/
% \usetikzlibrary{shapes,arrows}
\tikzstyle{block} = [draw, rectangle, 
minimum height=3em, minimum width=6em]




\usetikzlibrary{backgrounds}
\usetikzlibrary{mindmap,trees}	% For mind map
\newcommand{\degree}{\ensuremath{^\circ}}
\newcommand{\E}{\mathrm{E}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\Cov}{\mathrm{Cov}}
\newcommand\Ts{\rule{0pt}{2.6ex}}       % Top strut
\newcommand\Bs{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut

\graphicspath{{images/}}	% Put all images in this directory. Avoids clutter.

% SOME COMMANDS THAT I FIND HANDY
% \renewcommand{\tilde}{\widetilde} % dinky tildes look silly, dosn't work with fontspec
\newcommand{\comment}[1]{\textcolor{comment}{\footnotesize{#1}\normalsize}} % comment mild
\newcommand{\Comment}[1]{\textcolor{Comment}{\footnotesize{#1}\normalsize}} % comment bold
\newcommand{\COMMENT}[1]{\textcolor{COMMENT}{\footnotesize{#1}\normalsize}} % comment crazy bold
\newcommand{\Alert}[1]{\textcolor{Alert}{#1}} % louder alert
\newcommand{\ALERT}[1]{\textcolor{ALERT}{#1}} % loudest alert
%% "\alert" is already a beamer pre-defined
\newcommand*{\Scale}[2][4]{\scalebox{#1}{$#2$}}%

\def\Put(#1,#2)#3{\leavevmode\makebox(0,0){\put(#1,#2){#3}}}

\usepackage{gmp}
\usepackage[final]{feynmp-auto}

\usepackage[backend=bibtex,style=numeric-comp,firstinits=true]{biblatex}
\bibliography{bib}
\setbeamertemplate{bibliography item}[text]

\makeatletter\let\frametextheight\beamer@frametextheight\makeatother

% suppress frame numbering for backup slides
% you always need the appendix for this!
\newcommand{\backupbegin}{
	\newcounter{framenumberappendix}
	\setcounter{framenumberappendix}{\value{framenumber}}
}
\newcommand{\backupend}{
	\addtocounter{framenumberappendix}{-\value{framenumber}}
	\addtocounter{framenumber}{\value{framenumberappendix}} 
}


\definecolor{links}{HTML}{2A1B81}
%\hypersetup{colorlinks,linkcolor=,urlcolor=links}

% For shapo's formulas:
\def\lsi{\raise0.3ex\hbox{$<$\kern-0.75em\raise-1.1ex\hbox{$\sim$}}}
\def\gsi{\raise0.3ex\hbox{$>$\kern-0.75em\raise-1.1ex\hbox{$\sim$}}}
\newcommand{\lsim}{\mathop{\lsi}}
\newcommand{\gsim}{\mathop{\gsi}}
\newcommand{\wt}{\widetilde}
%\newcommand{\ol}{\overline}
\newcommand{\Tr}{\rm{Tr}}
\newcommand{\tr}{\rm{tr}}
\newcommand{\eqn}[1]{&\hspace{-0.7em}#1\hspace{-0.7em}&}
\newcommand{\vev}[1]{\rm{$\langle #1 \rangle$}}
\newcommand{\abs}[1]{\rm{$\left| #1 \right|$}}
\newcommand{\eV}{\rm{eV}}
\newcommand{\keV}{\rm{keV}}
\newcommand{\GeV}{\rm{GeV}}
\newcommand{\im}{\rm{Im}}
\newcommand{\disp}{\displaystyle}
\def\be{\begin{equation}}
\def\ee{\end{equation}}
\def\ba{\begin{eqnarray}}
\def\ea{\end{eqnarray}}
\def\d{\partial}
\def\l{\left(}
\def\r{\right)}
\def\la{\langle}
\def\ra{\rangle}
\def\e{{\rm e}}
\def\Br{{\rm Br}}
\def\fixme{{\color{red} FIXME!}}
\def\mc{{\color{Magenta}{MC}}\xspace}
\def\pdf{{\rm p.d.f.}}
\def\ARROW{{\color{JungleGreen}{$\Rrightarrow$}}\xspace}

\author{ {\fontspec{Trebuchet MS}Marcin Chrz\k{a}szcz} (Universit\"{a}t Z\"{u}rich)}
\institute{UZH}
\title[Monte Carlo integration and variance reduction]{Monte Carlo integration and variance reduction}
\date{3 March 2016}


\begin{document}
\tikzstyle{every picture}+=[remember picture]

{
\setbeamertemplate{sidebar right}{\llap{\includegraphics[width=\paperwidth,height=\paperheight]{bubble2}}}
\begin{frame}[c]%{\phantom{title page}} 
\begin{center}
\begin{center}
	\begin{columns}
		\begin{column}{0.9\textwidth}
			\flushright\fontspec{Trebuchet MS}\bfseries \Huge {Monte Carlo integration\\ and variance reduction}
		\end{column}
		\begin{column}{0.2\textwidth}
		  %\includegraphics[width=\textwidth]{SHiP-2}
		\end{column}
	\end{columns}
\end{center}
	\quad
	\vspace{3em}
\begin{columns}
\begin{column}{0.44\textwidth}
\flushright \vspace{-1.8em} {\fontspec{Trebuchet MS} \Large Marcin Chrząszcz\\\vspace{-0.1em}\small \href{mailto:mchrzasz@cern.ch}{mchrzasz@cern.ch}}

\end{column}
\begin{column}{0.53\textwidth}
\includegraphics[height=1.3cm]{uzh-transp}
\end{column}
\end{columns}

\vspace{1em}
%		\footnotesize\textcolor{gray}{With N. Serra, B. Storaci\\Thanks to the theory support from M. Shaposhnikov, D. Gorbunov}\normalsize\\
\vspace{0.5em}
	\textcolor{normal text.fg!50!Comment}{Monte Carlo methods, \\ 3 March, 2016}
\end{center}
\end{frame}
}




\begin{frame}\frametitle{Monte Carlo and integration}
\begin{footnotesize}
$\hookrightarrow$ {\color{BrickRed}{\textbf{All MC calculations are equivalent to performing an integration.}}}\\
$\rightrightarrows$ Assumptions: $r_i$ are random numbers from $\mathcal{U}(0,1)$. The MC result:
\begin{align*}
F=F(r_1,r_2,...r_n)
\end{align*}
is an unbiased estimator of the integral:
\begin{align*}
I=\int_0^1...\int_0^1 F(x_1,x_2,...,x_n)\,dx_1\,dx_2\,...\,dx_n,
\end{align*}
i.e.\ the expected value of $F$ is the integral:
\begin{align*}
E(F)=I.
\end{align*}
    \begin{exampleblock}{}                                                                                                                                                                                                                                                                                                
$\Rrightarrow$ This mathematical identity is the most useful property of MC methods. It is a link between mathematical analysis and the world of statistics. Now we can use the best of both worlds!
        \end{exampleblock} 
If we want to calculate an integral over a different range than $(0,1)$, we just rescale the previous result:
\begin{align*}
\dfrac{1}{N}\sum_{i=1}^N f(x_i) \xrightarrow{N\to \infty} E(f)=\dfrac{1}{b-a}\int_a^b f(x)dx
\end{align*}


\end{footnotesize}
\end{frame}
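
\begin{frame}[fragile]\frametitle{Monte Carlo and integration - a sketch}
\begin{footnotesize}
\ARROW A minimal Python sketch of the estimator above (an illustration only, not part of the lecture code; \texttt{numpy} and the seed are my own choices):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(42)

def mc_integrate(f, a, b, N=100_000):
    # sample x_i ~ U(a,b); the mean of f estimates I/(b-a)
    x = rng.uniform(a, b, N)
    return (b - a) * f(x).mean()

# example: int_0^{pi/2} cos(x) dx = 1
print(mc_integrate(np.cos, 0.0, np.pi / 2))   # ~ 1.0
\end{verbatim}
\end{footnotesize}
\end{frame}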


\begin{frame}\frametitle{Uncertainty from Monte Carlo methods}
\begin{footnotesize}
$\Rrightarrow$ In practice we do not have $N\to \infty$, so we will never know the exact result of an integral :(\\
$\longmapsto$ Let's use the {\color{BrickRed}{statistical}} world and estimate the uncertainty of the integral instead :)\\
$\rightarrowtail$ The variance of an MC integral:
\begin{align*}
V(\hat{I}) = \dfrac{1}{n} \Big\lbrace E(f^2) - E^2(f) \Big\rbrace = \dfrac{1}{n} \Big\lbrace \dfrac{1}{b-a} \int_a^b f^2(x)dx - I^2 \Big\rbrace
\end{align*}
    \begin{alertblock}{}                                                                                                                                                                                                                                                                                                
$\looparrowright$ To calculate $V(\hat{I})$ one needs to know the value of $I$!
\end{alertblock}
$\Rrightarrow$ In practice $V(\hat{I})$ is calculated via the estimator:
\begin{columns}
\column{2in}
\begin{align*}
\hat{V}(\hat{I})=\dfrac{1}{n}\hat{V}(f),
\end{align*}
\column{3in}
\begin{align*}
\hat{V}(f) = \dfrac{1}{n-1}\sum_{i=1}^n  \Big[ f(x_i)-\dfrac{1}{n} \sum_{i=1}^nf(x_i)\Big]^2.
\end{align*}
\end{columns}


$\Rrightarrow$ MC estimator of standard deviation: $\hat{\sigma}=\sqrt{\hat{V}(\hat{I})}$


\end{footnotesize}
\end{frame}
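
\begin{frame}[fragile]\frametitle{Uncertainty from Monte Carlo methods - a sketch}
\begin{footnotesize}
\ARROW A sketch of the estimator $\hat{V}(\hat{I})=\hat{V}(f)/n$ in Python (illustrative only; the function name is my own):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(1)

def mc_integrate_with_error(f, a, b, N=100_000):
    fx = f(rng.uniform(a, b, N))
    I_hat = (b - a) * fx.mean()
    # V(f) estimated with the (n-1) sample variance,
    # scaled by 1/n and by the Jacobian (b-a)
    sigma = (b - a) * np.sqrt(fx.var(ddof=1) / N)
    return I_hat, sigma

print(mc_integrate_with_error(np.cos, 0.0, np.pi / 2))
\end{verbatim}
\end{footnotesize}
\end{frame}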


\begin{frame}\frametitle{Buffon's needle - calculating $\pi$}
\begin{footnotesize}

$\Rrightarrow$ Buffon's needle (Buffon 1777, Laplace 1812):
We throw a needle (of length $l$) onto a surface covered with parallel lines a distance $L$ apart. If a thrown needle touches a line we count a hit, otherwise a miss. Knowing the numbers of hits and misses one can estimate $\pi$.
\vspace{0.3cm}
\begin{columns}
\column{0.1in}
{~}
\column{2in}
{\color{ForestGreen}{Experiment:}}
\column{2.8in}
{\color{Cerulean}{Theory:}}

\end{columns}


\begin{columns}
\column{0.1in}
{~}
\column{2in}

\includegraphics[width=0.9\textwidth]{images/buffon.png}\\
$n$ - number of hits,\\
$N$ - total number of throws\\
(hits and misses).

\column{2.8in}
$\Rightarrow$ $x$ - angle between the needle and the lines, $x \in \mathcal{U}(0,\pi)$.\\
$\Rightarrow$ the probability density function (\pdf) for $x$:
\begin{align*}
\rho(x)=\dfrac{1}{\pi}
\end{align*}
$\Rightarrow$ $p(x)$ - the probability of hitting a line for a given $x$ value:
\begin{align*}
p(x)=\dfrac{l}{L}\vert \cos x \vert
\end{align*}
$\Rightarrow$ Total hit probability:
\begin{align*}
P = E[p(x)]=\int_0^{\pi}p(x)\rho(x)dx=\dfrac{2l}{\pi L}
\end{align*}

\end{columns}
Now one can calculate $\hat{P}$ from MC: $\hat{P}=\dfrac{n}{N} \xrightarrow{N\to \infty} P= \dfrac{2l}{\pi L} \Rightarrow \hat{\pi}=\dfrac{2Nl}{nL}$


\end{footnotesize}
\end{frame}











\begin{frame}\frametitle{Buffon's needle - simplest Monte Carlo method}
\begin{footnotesize}
{\color{MidnightBlue}{Monte Carlo of the ``hit or miss'' type}}\\
Let's use the symmetry of the $p(x)$ function and take $0<x<\frac{\pi}{2}$.\\
$\Rightarrow$ Algorithm:\\
\begin{columns}
\column{0.1in}
{~}
\column{3.2in}



Generate points from the 2-dim.\ distribution:
\begin{align*}
(x,y): \mathcal{U}(0,\dfrac{\pi}{2})\times \mathcal{U}(0,1) {\rm{~and~}}
\end{align*}
\begin{align*}
y 
\begin{cases}
  \leq p(x): & \text{hit},\\
> p(x):             & \text{miss}.
\end{cases}
\end{align*}

\column{2.5in}
\includegraphics[width=0.75\textwidth]{images/result.png}



\end{columns}
Let's define the weight function: $w(x,y)=\Theta(p(x)-y)$, \\
where $\Theta(x)$ is the step function.\\
$\rightarrowtail$ \pdf: $\varrho(x,y)=\rho(x)g(y)=\frac{2}{\pi} \cdot 1$\\
$\Rightarrow$ Integrated probability:
\begin{align*}
P=E(w)=\int w(x,y) \varrho(x,y)dx dy = \dfrac{2l}{\pi L} \xleftarrow{N\to \infty}\hat{P}=\frac{1}{N} \sum_{i=1}^N w(x_i,y_i)= \dfrac{n}{N}
\end{align*}
Standard deviation for $\hat{P}$: $\hat{\sigma}=\dfrac{1}{\sqrt{N-1}}\sqrt{\dfrac{n}{N}\Big(1-\dfrac{n}{N}\Big)} $

\end{footnotesize}
\end{frame}
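
\begin{frame}[fragile]\frametitle{``Hit or miss'' - a sketch}
\begin{footnotesize}
\ARROW The ``hit or miss'' algorithm above, sketched in Python for $l=L$ (an illustration; names and the seed are my own choices):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(7)

N = 1_000_000
x = rng.uniform(0.0, np.pi / 2, N)   # x ~ U(0, pi/2)
y = rng.uniform(0.0, 1.0, N)         # y ~ U(0, 1)
hits = np.sum(y <= np.cos(x))        # w = Theta(p(x) - y), l = L
P_hat = hits / N                     # -> 2/pi
sigma = np.sqrt(P_hat * (1 - P_hat) / (N - 1))
print(2 / P_hat, sigma)              # pi estimate, sigma(P_hat)
\end{verbatim}
\end{footnotesize}
\end{frame}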

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}\frametitle{Heads or tails \mc}

    \begin{exampleblock}{}                                                                                                                                                                                                                                                                                                
$\Rrightarrow$ A \mc estimator of an integral that is based on counting the number of successful trials relative to the failed ones is called ``hit or miss''.
        \end{exampleblock}

\ARROW The probability is described by the binomial distribution:
\begin{equation}
\mathcal{P}(n) = \binom{N}{n} P^n (1-P)^{N-n}, \nonumber
\end{equation}
where $P$ is the probability of success, $N$ is the number of trials and $n$ is the number of successes.\\
\ARROW The following are true:
\begin{equation}
E(n)=NP, \nonumber
\end{equation}
\begin{equation}
V(n)=NP(1-P), \nonumber
\end{equation}
\ARROW Translating this into the probability estimator $\hat{P}=n/N$:
\begin{equation}
E(\hat{P})=P,{~}{~}V(\hat{P})=\dfrac{P(1-P)}{N}. \nonumber
\end{equation}
\ARROW E2.1 Prove the above.


\end{frame}
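
\begin{frame}[fragile]\frametitle{Heads or tails - numerical check}
\begin{footnotesize}
\ARROW A quick simulation check of $E(\hat{P})=P$ and $V(\hat{P})=P(1-P)/N$ (a sketch; the values of $P$, $N$ and the number of pseudo-experiments are arbitrary choices of mine):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(2)

P, N, experiments = 0.3, 1000, 20_000
n = rng.binomial(N, P, experiments)   # n successes in N tries
P_hat = n / N
print(P_hat.mean())                   # ~ P = 0.3
print(P_hat.var(ddof=1))              # ~ P*(1-P)/N = 2.1e-4
\end{verbatim}
\end{footnotesize}
\end{frame}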


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}\frametitle{Buffon needle}
\begin{small}


$\Rrightarrow$ Let's perform this toy experiment and calculate $\pi$.\\
$\hookrightarrow$ We can simulate the central position $(y)$ of a needle between $(-L, L)$ from $\mathcal{U}(-L, L)$.

  \begin{exampleblock}{Symmetry:}                                                                                                                                                                                                                                                                                                
Note the symmetry of the problem: if the position of the needle were $>L$, we could shift the needle down by a multiple of $L$.
        \end{exampleblock} 

$\hookrightarrow$ Next we simulate the angle $(\phi)$ with a flat distribution on $(0,\pi)$.\\
$\hookrightarrow$ The maximum and minimum $y$ positions of the needle are:
\begin{align*}
y_{\max}=y+\dfrac{l}{2}\vert \cos \phi \vert\\
y_{\min}=y-\dfrac{l}{2}\vert \cos \phi \vert
\end{align*}
$\hookrightarrow$ Now we check if the needle touches any of the lines: $y=L$, $y=0$ or $y=-L$. If it does, we count a hit.
\end{small}
\begin{center}


\begin{tiny}


\begin{tabular}{r c c c }
$N$ & $\hat{\pi}$ & $\hat{\pi} - \pi$ & $\sigma(\hat{\pi})$ \\
10000 & $3.12317$ & $-0.01842$ & $0.03047$\\
100000 & $3.14707$ & $0.00547$ & $0.00979$\\
1000000 & $3.13682$ & $-0.00477$ & $0.00307$\\
10000000 & $3.14096$ & $-0.00063$ & $0.00097$\\


\end{tabular}

\end{tiny}
\end{center}
\begin{footnotesize}

\ARROW E2.2 Write a program that calculates $\pi$ using the Buffon needle.
\end{footnotesize}
\end{frame}
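
\begin{frame}[fragile]\frametitle{Buffon's needle - a sketch}
\begin{frame-comment-free}
\end{frame-comment-free}
\begin{footnotesize}
\ARROW One possible Python sketch of the simulation described above, for $l=L=1$ (the details are my own choices, not the code behind the table):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(3)

def buffon_pi(N, l=1.0, L=1.0):
    y = rng.uniform(-L, L, N)          # needle centre
    phi = rng.uniform(0.0, np.pi, N)   # needle angle
    half = 0.5 * l * np.abs(np.cos(phi))
    y_min, y_max = y - half, y + half
    # does the needle touch y = -L, 0 or L?
    hit = ((y_min <= -L) | ((y_min <= 0) & (y_max >= 0))
           | (y_max >= L))
    n = hit.sum()
    return 2 * N * l / (n * L)         # pi_hat = 2Nl/(nL)

print(buffon_pi(1_000_000))
\end{verbatim}
\end{footnotesize}
\end{frame}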
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\begin{frame}\frametitle{Crude Monte Carlo method of integration}
\begin{footnotesize}
$\Rrightarrow$ {\color{MidnightBlue}{The crude Monte Carlo method of integration is based on the Law of Large Numbers: }}\\
\begin{align*}
\dfrac{1}{N} \sum_{i=1}^N f(x_i) \xrightarrow{N\to \infty} \dfrac{1}{b-a}\int_a^b f(x)dx =E(f)
\end{align*}
$\Rrightarrow$ The standard deviation can be calculated:
\begin{align*}
\sigma = \dfrac{1}{\sqrt{N}} \sqrt{\Big[ E(f^2) -E^2(f)\Big] }
\end{align*}

$\Rrightarrow$ From the LLN we have:
\begin{align*}
P= \int w(x) \rho(x) dx = \int_0^{\pi/2} (\frac{l}{L} \cos x ) \frac{2}{\pi} dx= \dfrac{2l}{\pi L}  \xrightarrow{N\to \infty} \frac{1}{N}\sum_{i=1}^N w(x_i)
\end{align*}
$\Rrightarrow$ An important comparison between the ``hit or miss'' and crude \mc~methods. One can show analytically that:

\begin{align*}
\hat{\sigma}^{{\rm{Crude}}} < \hat{\sigma}^{{\rm{Hit~or~miss}}}
\end{align*}


$\Rrightarrow$ Crude \mc~is \textbf{always} better than the ``hit or miss'' method. We will demonstrate this on an example (it can also be proven analytically).


\end{footnotesize}
\end{frame}


\begin{frame}\frametitle{Crude vs ``hit or miss''}
\begin{footnotesize}
\ARROW The crude \mc is never worse than the ``hit or miss'' method.\\
\ARROW Proof: let's assume we calculate the integral:
\begin{equation}
I=\int_0^1f(x)dx,~{\rm{and~}}0\leq f(x)\leq 1 {~}	\forall x \in (0,1) \nonumber
\end{equation}
\ARROW The variance for the ``hit or miss'' (HM) method: $V(\hat{I}_{HM})=\dfrac{1}{N}(I-I^2)$\\
\ARROW The variance for the crude method: $V(\hat{I}_{Crude})=\dfrac{1}{N}[\int_0^1 f^2(x)dx-I^2]$\\
\ARROW Now the difference:
\begin{align*}
V(\hat{I}_{HM}) - V(\hat{I}_{Crude})= \dfrac{1}{N}[I-\int_0^1 f^2(x)dx]= \dfrac{1}{N}\int_0^1 f(x)[1-f(x)]dx\geq 0~{\rm{q.e.d}}
\end{align*}
\ARROW E2.3 Calculate the following integrals, with uncertainties, using the ``hit or miss'' and crude methods:
\begin{equation}
\int_0^1 dx \dfrac{1}{\sqrt{2\pi}} e^{-\frac{x^2}{2}}  \nonumber
\end{equation}
\begin{equation}
\int_{x^2+y^2\leq 1} \dfrac{1}{4}\sqrt{1-(x^2+y^2)} dx dy \nonumber
\end{equation}


\end{footnotesize}
\end{frame}
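
\begin{frame}[fragile]\frametitle{Crude vs ``hit or miss'' - numerical check}
\begin{footnotesize}
\ARROW A quick numerical check of the proof above, for my own choice of test function $f(x)=x^2$ on $(0,1)$ (a sketch, not the E2.3 solution):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(5)

N = 1_000_000
x = rng.uniform(0.0, 1.0, N)
f = x**2                               # 0 <= f <= 1 on (0,1)

I_crude = f.mean()                     # crude estimator
hits = rng.uniform(0.0, 1.0, N) <= f   # hit-or-miss estimator
I_hm = hits.mean()

# empirical single-point variances; ratio > 1 favours crude
print(I_crude, I_hm)                   # both ~ 1/3
print(hits.var(ddof=1) / f.var(ddof=1))  # ~ (2/9)/(4/45) = 2.5
\end{verbatim}
\end{footnotesize}
\end{frame}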



\begin{frame}\frametitle{Generalization to the multi-dimensional case: crude method}
\begin{footnotesize}

\ARROW Let $x=(x_1, x_2,..., x_n)$ be a vector in the $n$-dim vector space $\mathcal{R}^n$,\\
$\Omega \subset \mathcal{R}^n$ - some subspace of the $n$-dim space,\\
$V\equiv V(\Omega)$ - the volume of the subspace $\Omega$.
\begin{equation}
I=\int_{\Omega} f(x) dx = V \int_{\Omega} f(x) dx/V= V \int_{\Omega} f(x) dp(x)  \equiv V J = V E(f), \nonumber
\end{equation}
where the \mc estimator:
\begin{equation}
\hat{J}=\dfrac{1}{N}\sum_{i=1}^Nf(x^{(i)}),{~}x\in\mathcal{U}(\Omega) \nonumber
\end{equation}
\ARROW The standard deviation:
\begin{equation}
\hat{\sigma}(\hat{J})=\dfrac{1}{\sqrt{N(N-1)}}\sqrt{\sum_{i=1}^N f^2(x^{(i)})-\dfrac{1}{N}\Big[\sum_{i=1}^{N}  f(x^{(i)})\Big]^2 } \nonumber
\end{equation}
\ARROW In the end we get:
\begin{equation}
\hat{I}=V\hat{J},{~}{~}\hat{\sigma}(\hat{I})=V\hat{\sigma}(\hat{J}) \nonumber
\end{equation}

\end{footnotesize}
\end{frame}
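
\begin{frame}[fragile]\frametitle{Multi-dimensional crude method - a sketch}
\begin{footnotesize}
\ARROW The $n$-dim crude estimator $\hat{I}=V\hat{J}$ in Python, for a box-shaped $\Omega$ (a sketch; a general $\Omega$ would need rejection or a change of variables):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(11)

def crude_nd(f, lows, highs, N=100_000):
    lows, highs = np.asarray(lows), np.asarray(highs)
    V = np.prod(highs - lows)          # volume of the box
    x = rng.uniform(lows, highs, (N, len(lows)))
    fx = f(x)
    J_hat = fx.mean()
    sigma_J = np.sqrt(fx.var(ddof=1) / N)
    return V * J_hat, V * sigma_J

# example: int of f(x,y) = x*y over the unit square = 1/4
print(crude_nd(lambda p: p[:, 0] * p[:, 1], [0, 0], [1, 1]))
\end{verbatim}
\end{footnotesize}
\end{frame}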



\begin{frame}\frametitle{Generalization to the multi-dimensional case: ``hit or miss''}
\begin{footnotesize}
\ARROW Let $x=(x_1, x_2,..., x_n)$ be a vector in the $n$-dim vector space $\mathcal{R}^n$,\\
$\Omega \subset \mathcal{R}^n$ - some subspace of the $n$-dim space,\\
$V\equiv V(\Omega)$ - the volume of the subspace $\Omega$.
\begin{equation}
I = \int_{\Omega} dx \int_{0}^{f_{max}} dy \Theta(f(x)-y)=Vf_{max} \int_{\Omega} \dfrac{dx}{V} \int_{0}^{f_{max}} \dfrac{dy}{f_{max}} \Theta(f(x)-y) \nonumber
\end{equation}
where $(x,y)\in \mathcal{U}(\Omega \times [0,f_{max}])$.\\
\ARROW Now we define $K$:
\begin{equation}
K=\int_{\Omega} \dfrac{dx}{V}\int_{0}^{f_{max}} \dfrac{dy}{f_{max}} \Theta(f(x)-y)=E(\Theta) \nonumber
\end{equation}
\ARROW We generate $(x,y) \in \mathcal{U}(\Omega \times [0,f_{max}])$ and check:
\begin{equation}
y \begin{cases}
                        \leq f(x): & \text{hit, weight}=1, \\
                        > f(x): & \text{miss, weight}=0,
                    \end{cases} \nonumber
\end{equation}
\ARROW In the end:
\begin{equation}
\hat{K}=\dfrac{n}{N},{~}{~}{~}\hat{\sigma}(\hat{K})=\dfrac{1}{\sqrt{N-1}}\sqrt{\hat{K}(1-\hat{K})} \nonumber
\end{equation}
\begin{equation}
\hat{I}=f_{max}V\hat{K},{~}{~}{~}\hat{\sigma}(\hat{I})=f_{max}V \hat{\sigma}(\hat{K}) \nonumber
\end{equation}
\end{footnotesize}
\end{frame}
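
\begin{frame}[fragile]\frametitle{Multi-dimensional ``hit or miss'' - a sketch}
\begin{footnotesize}
\ARROW The same box-shaped $\Omega$ with the ``hit or miss'' estimator $\hat{I}=f_{max}V\hat{K}$ (a sketch; $f_{max}$ is assumed to be known):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(13)

def hit_or_miss_nd(f, lows, highs, f_max, N=100_000):
    lows, highs = np.asarray(lows), np.asarray(highs)
    V = np.prod(highs - lows)
    x = rng.uniform(lows, highs, (N, len(lows)))
    y = rng.uniform(0.0, f_max, N)
    K_hat = np.mean(y <= f(x))         # estimates E(Theta)
    sigma_K = np.sqrt(K_hat * (1 - K_hat) / (N - 1))
    return f_max * V * K_hat, f_max * V * sigma_K

print(hit_or_miss_nd(lambda p: p[:, 0] * p[:, 1],
                     [0, 0], [1, 1], f_max=1.0))
\end{verbatim}
\end{footnotesize}
\end{frame}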


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}\frametitle{Crude MC vs ``hit or miss''}

$\Rrightarrow$ We can repeat the toy MC study as we did in the Buffon needle case.\\
$\hookrightarrow$ In this example we want to calculate $\int_0^{\pi/2} \cos x \, dx$.

\only<1>{
\begin{center}
\includegraphics[angle=-90,width=0.8\textwidth]{images/results_0.pdf}
\end{center}
}
\only<2>{
\begin{center}
\includegraphics[angle=-90,width=0.8\textwidth]{images/results_1.pdf}
\end{center}
}
\only<3>{
\begin{center}

\includegraphics[angle=-90,width=0.4\textwidth]{images/results_fit_0.pdf}
\includegraphics[angle=-90,width=0.4\textwidth]{images/results_fit_1.pdf}\\
\end{center}
$\Rrightarrow$ One clearly sees that both methods follow the $1/\sqrt{N}$ dependence and that the crude MC is always better than the ``hit or miss''.\\
$\Rrightarrow$ Please note that for the ``hit or miss'' we are using twice as many random numbers as for the crude method, so in terms of timing the crude MC is also much faster.

}



\end{frame}

\begin{frame}\frametitle{Classical methods of variance reduction}
\begin{footnotesize}

$\Rrightarrow$ In Monte Carlo methods the statistical uncertainty is defined as:
\begin{align*}
\sigma = \dfrac{1}{\sqrt{N}}\sqrt{V(f)}
\end{align*}
$\Rrightarrow$ Obvious conclusion:
\begin{itemize}
\item To reduce the uncertainty one needs to increase $N$.\\
$\rightrightarrows$ Slow convergence: to reduce the error by a factor of 10 one needs to simulate a factor of 100 more points!
\end{itemize}
$\Rrightarrow$ However, the other handle ($V(f)$) can be changed! $\longrightarrow$ Lots of theoretical effort goes into reducing this factor.\\
$\Rrightarrow$ We will discuss {\color{Mahogany}{four}} classical methods of variance reduction:
\begin{enumerate}
\item Stratified sampling.
\item Importance sampling.
\item Control variates.
\item Antithetic variates.
\end{enumerate}


\end{footnotesize}
\end{frame}




\begin{frame}\frametitle{Stratified sampling}
\begin{footnotesize}
$\Rrightarrow$ The most intuitive method of variance reduction. The idea is to divide the integration domain into sub-ranges and use the Riemann integral property:
\begin{align*}
I = \int_0^1 f(u) du = \int_0^a f(u)du + \int_a^1 f(u) du,~ 0<a<1.
\end{align*}

$\Rrightarrow$ The rationale for this method is that over smaller ranges the integrated function is flatter, and it is easy to see that the flatter the function, the smaller the uncertainty.\\
$\rightrightarrows$ A constant function would have zero uncertainty!

    \begin{exampleblock}{General schematic:}                                                                                                                                                                                                                                                                                                
Take the integration domain and divide it into smaller domains. In the $j^{th}$ domain, of volume $w_j$, we simulate $n_j$ points from a uniform distribution and sum the function values at the simulated points. Finally we combine the partial sums with weights proportional to $w_j$ and inversely proportional to $n_j$.
        \end{exampleblock} 

\end{footnotesize}
\end{frame}

\begin{frame}\frametitle{Stratified sampling - mathematical details}
\begin{footnotesize}
Let's define our integrals and domains:
\begin{align*}
I=\int_{\Omega} f(x) dx,{~}{~} \Omega=\bigcup_{i=1}^k w_i
\end{align*}
The integral over $j^{th}$ domain:
\begin{align*}
I_j=\int_{w_j} f(x) dx,{~}{~} \Rightarrow I = \sum_{j=1}^k I_j
\end{align*}
$\rightrightarrows$ $p_j$ uniform distribution in the $w_j$ domain: $dp_j=\frac{dx}{w_j}$.\\
$\rightrightarrows$ The integral is calculated based on crude \mc~method. The estimator is equal:
\begin{align*}
\hat{I}_j = \frac{w_j}{n_j} \sum_{i=1}^{n_j}f(x_j^{(i)})
\end{align*}
Now the total integral is just a sum:
\begin{align*}
\hat{I} = \sum_{j=1}^k \hat{I}_j = \sum_{j=1}^k \frac{w_j}{n_j} \sum_{i=1}^{n_j} f(x_j^{(i)}), 
\end{align*}
\begin{columns}
\column{0.2in}
\column{2in}
Variance:
$V(\hat{I})=\sum_{j=1}^k \dfrac{w_j^2}{n_j}V_j(f)$,
\column{3in}



and its estimator:
$\hat{V} (\hat{I}) = \sum_{j=1}^k \dfrac{w_j^2}{n_j} \hat{V}_j(f)$
\end{columns}

\end{footnotesize}
\end{frame}
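
\begin{frame}[fragile]\frametitle{Stratified sampling - a sketch}
\begin{footnotesize}
\ARROW The estimator $\hat{I}=\sum_j \frac{w_j}{n_j}\sum_i f(x_j^{(i)})$ for equal 1-dim strata (a sketch; equal allocation $n_j=N/k$ is assumed):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(17)

def stratified(f, a, b, N=100_000, k=5):
    edges = np.linspace(a, b, k + 1)
    I_hat, var_hat = 0.0, 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        n_j = N // k
        w_j = hi - lo                  # "volume" of stratum j
        fx = f(rng.uniform(lo, hi, n_j))
        I_hat += w_j * fx.mean()       # (w_j/n_j) * sum f
        var_hat += w_j**2 * fx.var(ddof=1) / n_j
    return I_hat, np.sqrt(var_hat)

print(stratified(np.cos, 0.0, np.pi / 2))   # ~ 1.0, small sigma
\end{verbatim}
\end{footnotesize}
\end{frame}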


\begin{frame}\frametitle{Stratified sampling in practice}
\begin{footnotesize}
    \begin{exampleblock}{~}                                                                                                                                                                                                                                                                                                
One can show that splitting the integration region $\Omega$ into equal regions will not increase the variance!
        \end{exampleblock} 
       \ARROW For example, in the case of two equal sub-samples:
\begin{align*}
V(I_{{\rm{crude}}}) - V(I_{{\rm{SS}}}) = \dfrac{1}{N}\left[\int_{\omega_1} f(x)dx - \int_{\omega_2} f(x) dx \right]^{2} \geq 0
\end{align*}
{\color{red}{A2.1} Prove the above.}
    \begin{exampleblock}{Practical advice:}
If we know very little about the integrated function, equal splitting of the $\Omega$ space is the best option!
        \end{exampleblock} 



\end{footnotesize}
\end{frame}




\begin{frame}\frametitle{Stratified sampling for the Buffon needle}
\begin{footnotesize}
\ARROW Let's apply stratified sampling to my favourite :) Buffon needle, with 5 sub-samples.
\begin{columns}
\column{2in}
\includegraphics[width=0.95\textwidth]{images/buff.png}
\column{3in}
\ARROW We have $\omega_i=\Omega/5 = \dfrac{\pi}{10}$ and $n_i=\dfrac{N}{5}$.\\
\ARROW The integral estimator:
\begin{align*}
\hat{P}=\sum_{j=1}^5 \dfrac{w_j}{\Omega\, n_j} \sum_{i=1}^{N/5} p(x_j^{(i)})=\dfrac{1}{N}\sum_{i=1}^N p(x_i)
\end{align*}
\ARROW The standard deviation (for $l=L$):
\begin{align*}
\sigma(\hat{\pi})_{SS}  = \dfrac{0.34}{\sqrt{N}} < \sigma(\hat{\pi})_{Crude}  = \dfrac{1.52}{\sqrt{N}}
\end{align*}
\end{columns}
\ARROW In this example we generated a constant number of events ($N/5$) in each sub-sample, independently of its impact on the integral.\\
\ARROW We can improve on this by generating the number of events in each sub-sample according to the area of the corresponding blue rectangle.\\
\ARROW E2.4 Using stratified sampling, calculate the integrals from E2.3 by dividing the area into 5 sub-samples. Compute the errors and compare them to the ones obtained with the crude method.


\end{footnotesize}
\end{frame}








%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}\frametitle{Importance sampling}
\begin{footnotesize}
$\Rrightarrow$ If the function changes rapidly over its domain, one needs a more elegant method: make the integrand flatter.\\
$\rightrightarrows$ The solution is known from a first course of mathematical analysis: change the integration variable :)
\begin{align*}
f(x)dx \longrightarrow \frac{f(x)}{g(x)}dG(x),~{\rm{where}}~g(x)=\frac{dG(x)}{dx}
\end{align*}
    \begin{exampleblock}{Schematic:}                                                                                                                                                                                                                                                                                                
\begin{itemize}
\item Generate points from the distribution $G(x)$ instead of $\mathcal{U}$.
\item For each generated point calculate the weight: $w(x)=\frac{f(x)}{g(x)}$.
\item Calculate the expected value $\hat{E}(w)$ and its variance $\hat{V}_G(w)$ for the whole sample.
\end{itemize}
        \end{exampleblock} 



\begin{itemize}

\item If $g(x)$ is chosen correctly, the resulting variance can be much smaller.
\item There are some mathematical requirements:
\begin{itemize}
\item $g(x)$ needs to be non-negative and analytically integrable over its domain.
\item $G(x)$ must be invertible, or a direct generator of the $g$ distribution must exist.
\end{itemize}
\end{itemize}


\end{footnotesize}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}\frametitle{Importance sampling - Example}
\begin{footnotesize}

$\Rrightarrow$ Let's take our good old $\pi$ determination example.
\vspace{0.3cm}
\begin{columns}
\column{0.1in}
\column{3in}
$\Rrightarrow$ Let's take here for simplicity: $L=l$. 
\begin{itemize}
\item Let's take a trivial linear weight function: $g(x)=\frac{4}{\pi}(1-\frac{2}{\pi}x)$
\item Its integral $G(x)=\frac{4}{\pi}x(1-\frac{x}{\pi})$ is invertible analytically.
\item The weight function:
\begin{align*}
w(x)=\frac{p(x)}{g(x)}=\frac{\pi}{4}\frac{\cos x}{1-2x/ \pi}
\end{align*}
\end{itemize}

\column{2in}
\includegraphics[width=0.95\textwidth]{images/result_weight.png}
\end{columns}
\begin{itemize}
\item Now the new standard deviation is smaller:
\end{itemize}
\begin{align*}
\sigma_{\pi}^{{\rm{IS}}} \simeq \frac{0.41}{\sqrt{N}} < \sigma_{\pi}\simeq \frac{1.52}{\sqrt{N}}
\end{align*}
\begin{itemize}
\item Importance sampling has advantages:
\begin{itemize}
\item Big improvements in variance reduction.
\item It is the only method that can cope with singularities.
\end{itemize}
\end{itemize}

\ARROW Calculate the first integral from E2.3 using importance sampling. As the weight function $g(x)$ take a linear function.
\end{footnotesize}
\end{frame}
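
\begin{frame}[fragile]\frametitle{Importance sampling - a sketch}
\begin{footnotesize}
\ARROW The example above in Python; inverting $u=G(x)$ gives $x=\frac{\pi}{2}(1-\sqrt{1-u})$. Note that, as on the slide, $\pi$ appears inside $g$ itself, so this is a variance-reduction demo rather than a blind measurement (a sketch for $l=L$):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(19)

N = 1_000_000
u = rng.uniform(0.0, 1.0, N)
x = 0.5 * np.pi * (1.0 - np.sqrt(1.0 - u))  # x ~ g(x)
h = np.cos(x) / (1.0 - 2.0 * x / np.pi)     # w = (pi/4) * h
# E(w) = 1  =>  E(h) = 4/pi, so pi_hat = 4 / mean(h)
pi_hat = 4.0 / h.mean()
sigma = pi_hat**2 / 4.0 * np.sqrt(h.var(ddof=1) / N)
print(pi_hat, sigma)                        # ~ 0.41/sqrt(N)
\end{verbatim}
\end{footnotesize}
\end{frame}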


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%










\begin{frame}\frametitle{Control variates}
\begin{footnotesize}
$\Rrightarrow$ Control variates use another nice property of the Riemann integral:
\begin{align*}
\int f(x) dx = \int [f(x)-g(x)]dx+ \int g(x)dx
\end{align*}
\begin{itemize}
\item $g(x)$ needs to be analytically integrable.
\item The uncertainty comes only from the integral: $\int [f(x)-g(x)]dx$. 
\item Obviously: $V(f-g) \xrightarrow{g\to f} 0$.
\end{itemize}
$\Rrightarrow$ Advantages:\\
\begin{itemize}
\item Quite stable, immune to singularities.
\item $g(x)$ doesn't need to be invertible analytically.
\end{itemize}
$\Rrightarrow$ Disadvantage:\\
\begin{itemize}
\item Useful only if you know $\int g(x)dx$ analytically.
\end{itemize}

\end{footnotesize}
\end{frame}
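
\begin{frame}[fragile]\frametitle{Control variates - a sketch}
\begin{footnotesize}
\ARROW A sketch for $\int_0^{\pi/2}\cos x\,dx$ with a linear control function of my own choice, $g(x)=1-2x/\pi$, whose integral $\int_0^{\pi/2}g(x)dx=\pi/4$ is known analytically:
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(23)

N = 1_000_000
a, b = 0.0, np.pi / 2
x = rng.uniform(a, b, N)
f = np.cos(x)
g = 1.0 - 2.0 * x / np.pi        # control variate
G = np.pi / 4                    # known analytic integral of g
diff = f - g                     # only this part is sampled
I_hat = (b - a) * diff.mean() + G
sigma = (b - a) * np.sqrt(diff.var(ddof=1) / N)
print(I_hat, sigma)              # smaller sigma than crude MC on f
\end{verbatim}
\end{footnotesize}
\end{frame}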


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}\frametitle{Antithetic variates}
\begin{footnotesize}
$\Rrightarrow$ In \mc~methods one usually uses independent random variables. The antithetic variates method deliberately uses a set of correlated variables (negative correlation being the important property):
\begin{itemize}
\item Let $f$ and $f'$ be functions of $x$ on the same domain.
\item The variance: $V(f+f')=V(f)+V(f')+2 \Cov(f,f')$.
\item If $\Cov(f,f')<0$ then you can reduce the variance.
\end{itemize}
$\Rrightarrow$ Advantages:
\begin{itemize}
\item If you can pick $f$ and $f'$ such that they have negative correlation, one can significantly reduce the variance!
\end{itemize}

$\Rrightarrow$ Disadvantages:
\begin{itemize}
\item There are no general methods to produce such negative correlations.
\item Hard to generalize to the multidimensional case.
\item You can't generate events from $f(x)$ with this method.
\end{itemize}



\end{footnotesize}
\end{frame}
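
\begin{frame}[fragile]\frametitle{Antithetic variates - a sketch}
\begin{footnotesize}
\ARROW A sketch using the classic pair $x$ and $(a+b-x)$, which gives negatively correlated values for a monotonic integrand such as $\cos x$ on $(0,\pi/2)$ (my own choice of example):
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(29)

N = 500_000                      # N pairs = 2N evaluations
a, b = 0.0, np.pi / 2
x = rng.uniform(a, b, N)
# f' = f(a+b-x); average each antithetic pair
pair = 0.5 * (np.cos(x) + np.cos(a + b - x))
I_hat = (b - a) * pair.mean()
sigma = (b - a) * np.sqrt(pair.var(ddof=1) / N)
print(I_hat, sigma)              # compare with crude MC at 2N points
\end{verbatim}
\end{footnotesize}
\end{frame}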



\begin{frame}\frametitle{Wrap up}
\begin{footnotesize}
$\Rrightarrow$ To sum up:
\begin{itemize}
\item We discussed the basic mathematical properties of \mc~methods.
\item We showed that despite the stochastic nature of \mc~methods, they can be used to determine completely non-stochastic quantities.
\item We demonstrated that there is a perfect isomorphism between the \mc~method and integration.
\item We learned how to calculate integrals and estimate the uncertainties.
\item Finally, we discussed several classical methods of variance reduction.
\end{itemize}


\end{footnotesize}
\end{frame}







\backupbegin   


\backupend			

\end{document}