\documentclass[xcolor=svgnames]{beamer}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage{polski}
%\usepackage{amssymb,amsmath}
%\usepackage[latin1]{inputenc}
%\usepackage{amsmath}
%\newcommand\abs[1]{\left|#1\right|}
\usepackage{amsmath}
\newcommand\abs[1]{\left|#1\right|}
\usepackage{hepnicenames}
\usepackage{hepunits}
\usepackage{color}
\usepackage{feynmp}
\usepackage{pst-pdf}
\usepackage{hyperref}
\usepackage{xcolor}
%\usepackage{fontspec}
%\newfontfamily\DejaSans{DejaVu Sans}


\setbeamertemplate{footline}{\insertframenumber/\inserttotalframenumber}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\definecolor{mygreen}{cmyk}{0.82,0.11,1,0.25}



%\DeclareCaptionFont{uiblack}{\color{uiblack}}
%\DeclareCaptionFont{uipoppy}{\color{uipoppy}}
%\captionsetup{labelfont={uipoppy},textfont=uiblack}

% see the macros.tex file for definitions





\renewcommand{\PKs}{{\HepParticle{K}{S}{}\xspace}}
\newcommand{\at}{\makeatletter @\makeatother}

%--------------------------------------------------------------------
%                           Introduction
%--------------------------------------------------------------------

\usetheme{Sybila}

\title[Kaggle challenge from LHCb]{Kaggle challenge from LHCb}
\author{Thomas Blake$^{1}$, Marc-Olivier Bettler$^{2}$, Marcin Chrz\k{a}szcz$^{3,4}$,\\ Francesco Dettori$^{2}$, Andrey Ustyuzhanin$^{5,6}$, Tatiana Likhomanenko$^{5,6}$}

\institute{
$^1$~University of Warwick\\
$^2$~CERN, Geneva \\
$^3$~University of Zurich \\
$^4$~Institute of Nuclear Physics, Krakow \\
$^5$~Yandex School of Data Analysis,~Moscow \\
$^6$~~NRC ``Kurchatov Institute'',~Moscow
}
\date{27 February 2015}
\begin{document}
% --------------------------- SLIDE --------------------------------------------
\frame[plain]{\titlepage}
\author{Marcin Chrz\k{a}szcz}
% ------------------------------------------------------------------------------
% --------------------------- SLIDE --------------------------------------------

\institute{~(UZH, IFJ)}

%--------------------------------------------------------------------
%                          OUTLINE
%--------------------------------------------------------------------





\begin{frame}\frametitle{What is Kaggle - The Home of Data Science}
\begin{columns}
\column{3.5in}
\begin{itemize}
        \item Kaggle is the world's largest portal for the data science community.
        \item It gives data scientists the opportunity to solve real-world problems across a diverse array of industries, including life sciences, financial services, energy and information technology.
        \item Participants use Kaggle to meet, learn, network and collaborate with experts from related fields.
          \item Usually each contest/challenge has a cash reward ($13~\rm k\$$ in the ATLAS case).

\end{itemize}
\column{0.1in}
{~}\\

\column{1.3in}
Who has already\\ used Kaggle: \\{~}\\
\includegraphics[height=0.9cm]{images/NASA.png}\hspace*{0.3cm}
\includegraphics[height=0.9cm]{images/download.png}\hspace*{0.3cm}\\
%\includegraphics[height=0.9cm]{images/ms.png}\hspace*{0.3cm}\\
\includegraphics[height=0.9cm]{images/ford.jpg}\hspace*{0.3cm}\\
\includegraphics[height=0.9cm]{images/fb.jpg}\hspace*{0.3cm}  \\
and many, many\\ others, including:
\includegraphics[height=0.9cm]{images/ATLAS-chrome-logo-blue_hi.png}


\end{columns}

\end{frame}








\begin{frame}\frametitle{How does Kaggle work?}
\begin{itemize}
\item One party (e.g.\ LHCb) defines a data analysis problem, provides the data sets and the rules used to rank solutions.
\item Prizes are allocated for the contests.
\item When a contest is over (after a couple of months) the top solutions are made public.
\item Current contests:
\end{itemize}
\begin{center}
\includegraphics[width=10cm]{images/kagg_now.png}
\end{center}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}\frametitle{ATLAS path to $\color{white}{H \to \tau \tau}$}
\begin{columns}
\column{3in}
\begin{itemize}
\item Because of its poorer vertex resolution, ATLAS is less sensitive than CMS in modes like $\PHiggs \to \Ptau \Ptau$ or $\PHiggs \to \Pbottom \APbottom$.
\item ATLAS released their MC datasets (for $\PHiggs \to \Ptau \Ptau$) so that participants could train classifiers that can be used for future analyses.
\item After evaluation they gained $\sim 10\%$ in sensitivity!
\item Other ATLAS analyses have picked up some of the open source libraries and are using them.
\item During the contest there are discussions between the participants, which are also available to physicists $\rightarrow$ knowledge transfer.
\item Over 1,800 teams participated in this contest!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%55
%\item This would be great for $\Ptau \to \Pmu \Pmu \Pmu$, LHCb and HEP community as well.
%\item Also after the challenge they organized a workshop $\to$ other LHCb analysis could benefit from this kind of collaborations, we almost always use MVAs.

\end{itemize}

\column{2in}
\includegraphics[width=1.9in]{images/higgs.png}\\
\includegraphics[width=1.9in]{images/HiggsML-sm.jpg}
\end{columns}

\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%55

\begin{frame}\frametitle{Future of $\color{white}{\tau \to \mu\mu\mu}$ in LHCb}
\begin{columns}
\column{3in}
\begin{itemize}
\item With $3~\invfb$ we had an expected limit of $5.0 \times 10^{-8}$.
\item What can we expect after another $5~\invfb$?
\item $\dfrac{5.0\times 10^{-8}}{ \sqrt{\dfrac{5\times 2}{3} }}=2.7 \times 10^{-8}$
\item We should aim to do better than Belle $(2.1\times 10^{-8})$!
\item Help from the Kaggle community would be very much appreciated, for $\Ptau \to \Pmu \Pmu \Pmu$, LHCb and the HEP community as well.
\end{itemize}

\column{2in}
\includegraphics[width=1.9in]{images/bananaB.pdf}

\end{columns}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\begin{frame}\frametitle{Comparison with Higgs Boson Challenge}
{
\fontsize{10}{12}
\selectfont
The proposed challenge is related to the Higgs Boson challenge, but differs from it by being more realistic (closer to a real physics analysis):
\begin{enumerate}
        \item the training dataset includes not only simulated data but also real data (signal-like and background-like events have a different nature);
        \item the test dataset also includes a control channel to assess DATA/MC differences;
        \item a submission must pass additional checks (the classifier must not be correlated with the mass and must behave similarly on real and simulated data);
        \item our quality metric uses the predicted probability in all bins, as we do in a real analysis;
        \item all scripts for testing the mass correlation, the DATA/MC agreement and the limit evaluation are provided by us;
          \item the $\Ptau$ in LHCb comes from five different sources, which makes this contest more interesting for machine learning people.
\end{enumerate}
}
\end{frame}

\begin{frame}\frametitle{What we would like to provide}
\begin{small}
\begin{itemize}
  \item Full MC sample of $\PDs \to \phi \pi$ and $\sim 30\%$ of the $\PDs \to \phi \pi$ data.
  \item Full MC sample of $\Ptau \to \Pmu \Pmu \Pmu$ and the full $\Ptau \to \Pmu \Pmu \Pmu$ data.\footnote{I will comment on protecting our data in a couple of slides.}
\item The DATA will contain our standard ntuple entries (excluding lumi etc.).
\item We want to give as many of those as possible, to allow people to construct their own variables (this happens often).
\item The size of the data sets and the training/testing split is up to us.
\end{itemize}
\end{small}
\begin{center}
\includegraphics[scale=0.13]{Diagram1.png}
\end{center}
\end{frame}

\begin{frame}\frametitle{Evaluation}

\begin{itemize}
        \item Check the correlation between the mass and the model predictions on all test $\tau \to\mu\mu\mu$ sidebands using the Cram\'er-von Mises measure~(\href{http://arxiv.org/abs/1410.4140}{e.g.\ arXiv:1410.4140}).
        \item Check the agreement between MC and data on $\PDs\to\phi\pi$ (test MC and test data) using the Kolmogorov-Smirnov distance.
        \item Calculate the Approximate Binned Median Statistics (ABMS), only if the above two tests are passed.
        \item We need to use ABMS because the standard CLs method is computationally too expensive.
        \item ABMS is just the value of the test statistic and shows how well the two hypotheses can be distinguished.
        \item ABMS is similar to the AMS metric used in the Higgs competition, but involves all the available statistics, making it more meaningful and stable. Details of the metric can be found in the backups.
\end{itemize}
\begin{itemize}
        \item Participants choose the number of bins and the bin thresholds themselves (i.e. the splitting of the classifier output). A minimal sketch of the evaluation pipeline is shown on the next slide.
\end{itemize}
\end{frame}
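
% --------------------------- SLIDE --------------------------------------------
\begin{frame}[fragile]\frametitle{Evaluation: pipeline sketch}
A minimal Python sketch of the two checks that gate the ABMS score, assuming hypothetical inputs (predictions and masses as numpy arrays) and placeholder thresholds; the official scripts we provide implement the real procedure:
{\footnotesize
\begin{verbatim}
import numpy as np
from scipy.stats import ks_2samp

def cvm_2samp(x, y):
    # simple two-sample Cramer-von Mises distance between
    # the empirical CDFs of x and y
    grid = np.sort(np.concatenate([x, y]))
    cdf_x = np.searchsorted(np.sort(x), grid, side='right') / len(x)
    cdf_y = np.searchsorted(np.sort(y), grid, side='right') / len(y)
    return np.mean((cdf_x - cdf_y) ** 2)

def passes_checks(pred_sb, mass_sb, pred_mc, pred_data,
                  cvm_max=0.002, ks_max=0.09):   # placeholder cuts
    # 1) mass correlation: predictions in the low- and high-mass
    #    sidebands should have the same distribution
    lo, hi = np.percentile(mass_sb, [25, 75])
    cvm = cvm_2samp(pred_sb[mass_sb < lo], pred_sb[mass_sb > hi])
    # 2) DATA/MC agreement on the Ds -> phi pi control channel
    ks = ks_2samp(pred_mc, pred_data).statistic
    return cvm < cvm_max and ks < ks_max
\end{verbatim}
}
Only if both checks pass is the ABMS score computed (formulas in the backups).
\end{frame}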

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5

\begin{frame}\frametitle{Evaluation Examples}
AdaBoost and Gradient Boosting were trained (as participants could do). In addition, AdaBoost and Uniform Gradient Boosting were trained using the mass as a classifier input. (CVM: Cram\'er-von Mises metric, KS: Kolmogorov-Smirnov metric.)

\begin{center}
  \begin{tabular} {| l | r | r | r | r |}
    \hline
    {~}& ada & ada(mass) & gb & ugb(mass) \\ \hline  \hline
        CVM metric &  0.005674 & \textcolor{red} {0.061837} &\textcolor{blue}{ 0.005642 }& 0.005714 \\ \hline
        CVM p-value & 0.918667 & \textcolor{red}{1.000000} & \textcolor{blue}{0.850000} & 0.970000 \\ \hline
        KS distance & 0.028815 & 0.018353 & 0.027854 & \textcolor{blue}{0.025621} \\ \hline
        ABMS public & 1.557205 & 1.790282 & \textcolor{blue}{1.564490} & 1.545545 \\ \hline
        ABMS private & 1.549253 & 1.785880 & \textcolor{blue}{1.562412} & 1.542357 \\ \hline
  \end{tabular}
\end{center}

\begin{itemize}

\item We have already verified with standard classifiers that these metrics work and reject the cases where people try to do something strange (like adding the mass to the training); a training sketch follows on the next slide.

\end{itemize}
\end{frame}
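
% --------------------------- SLIDE --------------------------------------------
\begin{frame}[fragile]\frametitle{Evaluation Examples: training sketch}
A minimal sketch of how baselines like those in the table can be trained with scikit-learn, on synthetic stand-in data (our real ntuples and feature lists are not reproduced here; uniform gradient boosting comes from a separate package):
{\footnotesize
\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.ensemble import (AdaBoostClassifier,
                              GradientBoostingClassifier)

# synthetic stand-in for signal/background ntuple features
X, y = make_classification(n_samples=5000, n_features=20,
                           random_state=0)
X_tr, X_te, y_tr, y_te = X[:4000], X[4000:], y[:4000], y[4000:]

ada = AdaBoostClassifier(n_estimators=100).fit(X_tr, y_tr)
gb = GradientBoostingClassifier(n_estimators=100).fit(X_tr, y_tr)

# predicted signal probabilities, later binned for the ABMS score
p_ada = ada.predict_proba(X_te)[:, 1]
p_gb = gb.predict_proba(X_te)[:, 1]
\end{verbatim}
}
\end{frame}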

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}\frametitle{Protecting our data}
\begin{columns}
\column{3in}
\begin{itemize}
\item We could make our data open without any modification, as performing any analysis without knowledge of the preselection is not possible.
\item If the collaboration feels strongly about protecting our data, we can smear/shift it in a way that makes a physics analysis impossible while leaving the training undistorted (see the sketch on the next slide).
\item Both scenarios are acceptable for us.
\end{itemize}
\column{2in}
{~}
\includegraphics[width=1.9in]{images/images.jpg}
\end{columns}
\end{frame}
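
% --------------------------- SLIDE --------------------------------------------
\begin{frame}[fragile]\frametitle{Protecting our data: smearing sketch}
One possible realisation of the smearing/shifting, as a sketch only (the offset and resolution below are placeholders, not a decided procedure): randomise the invariant mass, which is excluded from training anyway, so the absolute mass scale is lost while the classifier inputs stay untouched.
{\footnotesize
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(42)

def smear_mass(mass_mev, shift=25.0, sigma=10.0):
    # fixed offset plus Gaussian noise (MeV): the peak position
    # becomes meaningless for a physics measurement, but the
    # training variables are left unchanged
    return mass_mev + shift + rng.normal(0.0, sigma, len(mass_mev))
\end{verbatim}
}
\end{frame}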

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}\frametitle{Timescale}
\begin{itemize}
        \item Feb: ask for the OK from LHCb, allocate the prize budget at Yandex (15k~\$, more than ATLAS had ;) )
        \item Mar: prepare the website and explanatory materials, refine the evaluation procedures, test the challenge
        \item Mar: propose a workshop at KDD/NIPS
        \item Apr: announce the challenge, start
        \item Apr-Jun: run the challenge
        \item Jul: announce the winners
        \item Aug/Sep: run the KDD/NIPS workshop, award the winners
\end{itemize}
\begin{enumerate}
\item KDD/NIPS are very well known machine learning conferences.
\item The plan would be to have a session there dedicated to our challenge (as was the case for the ATLAS competition).
\item Afterwards we could organise a workshop at CERN as well, to hopefully start a fruitful collaboration.
\end{enumerate}


\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}\frametitle{Conclusions}
\begin{enumerate}
        \item A Kaggle contest for LHCb would be very beneficial for us.
        \item A lot of work has been put in to make the contest as useful for us as possible (correlation checks, MC/DATA agreement).
        \item All scripts are ready, automated and tested.
        \item We look forward to feedback from the collaboration.
\end{enumerate}
\begin{center}
\includegraphics[width=1.9in]{pr.jpg}
\end{center}
\end{frame}


\begin{frame}\frametitle{~}
  \begin{center}
\begin{Large}
BACKUPS
\end{Large}
\end{center}

\end{frame}
 \begin{frame}\frametitle{Approximate Binned Median Statistics (ABMS)}

 In a real analysis, when looking for an upper limit, you compare the null hypothesis $H_0$ (background-only) with a spectrum of other hypotheses representing different branching fractions. You then choose the smallest branching fraction that still gives sufficient significance.
 The measure of significance is CLs, which is computed from the statistic
 $q$, equal to the ratio of the likelihoods of your hypotheses.

 For Kaggle, to simplify the understanding/computation of the metric, we use two ``tricks'':
 \begin{itemize}

   \item instead of comparing $H_0$ with a spectrum of hypotheses, we compare it with the hypothesis with a specific branching fraction (taken from the paper on $\tau\rightarrow \mu\mu\mu$), and
   \item instead of estimating the significance, we just calculate the statistic $q$ (the better it is, the better the significance the classifier can provide).
 \end{itemize}

 %From practical point of view it could be shown classifiers with better ROC curve have better ABMS.

 \end{frame}

\begin{frame}
\frametitle{Approximate Binned Median Statistics (ABMS) - 2}
For a given classifier $g$, the number of events $n$ found in a bin (a region of the input variable space) is assumed to follow a Poisson distribution with mean $\mu_s+\mu_b$,
$$P(n  | \mu_s, \mu_b) = \frac{(\mu_s + \mu_b)^n}{n!}e^{-(\mu_s+\mu_b)},$$
where $\mu_s$ and $\mu_b$ are the expected numbers of events from the signal and background, respectively. To establish the existence of the signal process, we test the hypothesis $H_0$: $\mu_s = 0$ against the alternative $H_1$: $\mu_s>0$. We use several bins and assume that each bin has an independent parameter $\mu_s^i$. Thus the likelihood ratio reads:
\begin{equation}
Q= \prod_{bin} \frac{P(n_i|0, \mu_{b}^i)}{P(n_i | \hat{\mu}_s^i, \mu_{b}^i)} = \prod_{bin}\left(\frac{\mu_b^i}{n_i}\right)^{n_i} e^{n_i - \mu_b^i},
\end{equation}
where $\hat{\mu}_s^i = n_i - \mu_b^i$ is the maximum likelihood estimator of $\mu_s^i$ given that we observe $n_i$ events in the $i$-th bin.
\end{frame}

\begin{frame}\frametitle{Approximate Binned Median Statistics (ABMS) - 3}

\begin{equation}
q = -2  \ln Q = 2  \sum_{bin} \left(n_i \ln{\frac{n_i}{\mu_b^i}} - n_i +\mu_b^i\right)
\end{equation}

For the empirical estimate we take $\mu_b^i = b_i$ and $n_i = s_i + b_i$, where $s_i$ is the estimated amount of signal according to the best known upper limit. The empirical estimate of the statistic is then
\begin{equation}
\hat{q}= 2  \sum_{bin} \left((s_i + b_i) \ln{\left(1 + \frac{s_i}{b_i}\right)} - s_i\right)
\end{equation}

Adding a regularisation term, we can define
\begin{equation}
\mathrm{ABMS} = \sqrt{\hat{q}_{\mathrm{reg}}} = \sqrt{\sum_{bin} \mathrm{AMS}_i^2},
\end{equation}
where $\mathrm{AMS}_i$ is calculated for the $i$-th bin in the same way as in the HiggsML challenge (a numerical sketch follows on the next slide):
$$\mathrm{AMS} = \sqrt{2 \left((s + b + b_{reg}) \ln{\left(1 + \frac{s}{b+b_{reg}}\right)} - s\right)} \cong \frac{s}{\sqrt{b + b_{reg}}}$$
\end{frame}
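
% --------------------------- SLIDE --------------------------------------------
\begin{frame}[fragile]\frametitle{ABMS: numerical sketch}
A direct transcription of the formulas above into Python; the per-bin estimates $s_i$, $b_i$ and the regularisation term $b_{reg}$ are inputs (the numbers below are placeholders, not challenge values):
{\footnotesize
\begin{verbatim}
import numpy as np

def ams(s, b, b_reg=10.0):
    # per-bin AMS, as in the HiggsML challenge
    return np.sqrt(2 * ((s + b + b_reg)
                        * np.log(1 + s / (b + b_reg)) - s))

def abms(s_bins, b_bins, b_reg=10.0):
    # ABMS = sqrt(sum_i AMS_i^2) over the classifier-output bins
    s = np.asarray(s_bins, dtype=float)
    b = np.asarray(b_bins, dtype=float)
    return np.sqrt(np.sum(ams(s, b, b_reg) ** 2))

# placeholder per-bin signal/background expectations
print(abms([1.0, 4.0, 10.0], [100.0, 30.0, 5.0]))
\end{verbatim}
}
\end{frame}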



\end{document}