Newer
Older
R_phipi / BDT_select.ipynb
@Davide Lancierini Davide Lancierini on 23 Oct 2018 54 KB Debugging
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import os\n",
    "import pickle\n",
    "import math\n",
    "\n",
    "# Third-party\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import matplotlib.pyplot as plt\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "# Weight initializers with unit standard deviation.\n",
    "# They are not referenced by any other cell visible in this notebook.\n",
    "normal = tf.random_normal_initializer(stddev=1)\n",
    "trunc_normal = tf.truncated_normal_initializer(stddev=1)\n",
    "\n",
    "# Project-local helpers (extract_array and DNN are used further down).\n",
    "# NOTE(review): these wildcard imports run last and may shadow any name\n",
    "# defined above; prefer explicit imports once the module contents are known.\n",
    "from architectures.data_processing import *\n",
    "from architectures.utils.toolbox import *\n",
    "from architectures.DNN import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Channel configuration: l_flv[l_index] is the tag spliced into every\n",
    "# data/model path built below (presumably the lepton flavour -- confirm).\n",
    "l_flv = ['e', 'mu']\n",
    "l_index = 0\n",
    "\n",
    "# Not read by any other cell visible in this notebook.\n",
    "mother_ID = [\"Ds\", \"Dplus\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DATA LOADING & PREPROCESSING"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pickled branch dictionary for the selected channel.\n",
    "# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.\n",
    "flavour_pair = l_flv[l_index] + l_flv[l_index]\n",
    "input_pickle = ('/disk/lhcb_data/davide/Rphipi/NN_for_selection/' + flavour_pair\n",
    "                + '/' + 'data_for_NN_' + flavour_pair + '.pickle')\n",
    "with open(input_pickle, 'rb') as f:\n",
    "    data_dict = pickle.load(f, encoding='latin1')\n",
    "\n",
    "# Each (branch, ndof branch) pair below is replaced by branch / ndof.\n",
    "# NOTE(review): the *_IPCHI2_OWNPV branches are divided by *_ENDVERTEX_NDOF;\n",
    "# confirm this matches the preprocessing the saved models were trained with.\n",
    "# The phi_OWNPV_CHI2 / phi_OWNPV_NDOF pair is not processed here and\n",
    "# phi_OWNPV_NDOF is not deleted.\n",
    "ndof_pairs = (\n",
    "    (\"Ds_ENDVERTEX_CHI2\", \"Ds_ENDVERTEX_NDOF\"),\n",
    "    (\"Ds_OWNPV_CHI2\", \"Ds_OWNPV_NDOF\"),\n",
    "    (\"Ds_IPCHI2_OWNPV\", \"Ds_ENDVERTEX_NDOF\"),\n",
    "    (\"phi_ENDVERTEX_CHI2\", \"phi_ENDVERTEX_NDOF\"),\n",
    "    (\"phi_IPCHI2_OWNPV\", \"phi_ENDVERTEX_NDOF\"),\n",
    ")\n",
    "for chi2_branch, ndof_branch in ndof_pairs:\n",
    "    data_dict[chi2_branch] = data_dict[chi2_branch] / data_dict[ndof_branch]\n",
    "\n",
    "# The ndof branches are only needed as denominators; remove them afterwards.\n",
    "for ndof_branch in (\"Ds_ENDVERTEX_NDOF\", \"Ds_OWNPV_NDOF\", \"phi_ENDVERTEX_NDOF\"):\n",
    "    del data_dict[ndof_branch]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "task='TEST'\n",
    "\n",
    "test=0\n",
    "i=4\n",
    "PATH=l_flv[l_index]+'/test_'+str(test)+'/NN_'+str(i)\n",
    "\n",
    "with open(PATH+'/variables_used.pkl', 'rb') as f:  \n",
    "        branches_needed = pickle.load(f)\n",
    "        \n",
    "#Number of input features\n",
    "m=data_dict[\"Ds_ConsD_M\"].shape[0]\n",
    "branches_needed.pop()\n",
    "dim=len(branches_needed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the input matrix from the selected branches.\n",
    "data = extract_array(data_dict, branches_needed, dim, m)\n",
    "\n",
    "# Standardise along axis 0: subtract the mean, then divide by the standard deviation.\n",
    "# NOTE(review): the statistics come from this dataset itself, not from the\n",
    "# training sample -- confirm that this matches how the models were trained.\n",
    "data_mean = np.mean(data, axis=0)\n",
    "data_1 = np.subtract(data, data_mean)\n",
    "data_std = np.std(data_1, axis=0)\n",
    "data_2 = np.divide(data_1, data_std)\n",
    "\n",
    "# Displayed as the cell output: the standard deviation of each standardised column.\n",
    "# (To bypass the standardisation, set data_2 = data instead.)\n",
    "np.std(data_2, axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NN SELECTION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the hyper-parameter dictionary saved in the model directory.\n",
    "with open(PATH + '/hyper_parameters.pkl', 'rb') as f:\n",
    "    hyper_dict = pickle.load(f)\n",
    "\n",
    "# Unpack the entries into notebook-level names.\n",
    "# PATH is overwritten with the value stored in the pickle.\n",
    "(k, LEARNING_RATE, BETA1, BATCH_SIZE, EPOCHS,\n",
    " VAL_PERIOD, SEED, sizes, LAMBD, PATH) = [\n",
    "    hyper_dict[name] for name in (\n",
    "        \"k\", \"LEARNING_RATE\", \"BETA1\", \"BATCH_SIZE\", \"EPOCHS\",\n",
    "        \"VAL_PERIOD\", \"SEED\", \"sizes\", \"LAMBD\", \"PATH\")\n",
    "]\n",
    "\n",
    "# This check runs after the load above and uses the PATH read from the pickle,\n",
    "# so it only warns when that stored directory lacks the hyper-parameter file.\n",
    "if not os.path.exists(PATH + '/hyper_parameters.pkl'):\n",
    "    print(\"No saved sizes dict\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input for propagation (?, 13)\n",
      "Logits shape (?, 2)\n",
      "Input for propagation (?, 13)\n",
      "Logits shape (?, 2)\n"
     ]
    }
   ],
   "source": [
    "# Clear the default graph, then rebuild the network from the stored hyper-parameters.\n",
    "tf.reset_default_graph()\n",
    "model_NN = DNN(\n",
    "    dim, sizes,\n",
    "    lr=LEARNING_RATE, beta1=BETA1, lambd=LAMBD,\n",
    "    batch_size=BATCH_SIZE, epochs=EPOCHS,\n",
    "    save_sample=VAL_PERIOD, path=PATH, seed=SEED,\n",
    ")\n",
    "\n",
    "# init_op initialises only the global variables that are not trainable.\n",
    "vars_to_train = tf.trainable_variables()\n",
    "vars_all = tf.global_variables()\n",
    "trainable_set = set(vars_to_train)\n",
    "vars_to_init = [var for var in vars_all if var not in trainable_set]\n",
    "init_op = tf.variables_initializer(vars_to_init)\n",
    "\n",
    "# Saver used by the prediction cell to restore the checkpoint.\n",
    "saver = tf.train.Saver()\n",
    "\n",
    "# Limit this process to 33% of the GPU memory.\n",
    "gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " Evaluate model on test set...\n",
      "INFO:tensorflow:Restoring parameters from e/test_0/NN_4/NN_model.ckpt\n",
      "Model restored.\n"
     ]
    }
   ],
   "source": [
    "# Run the restored network over the standardised inputs in fixed-size batches.\n",
    "output_dict_NN = {}\n",
    "batch_size = 200\n",
    "n_samples = data_2.shape[0]\n",
    "# Ceiling division: a trailing, shorter batch is counted as a batch too.\n",
    "n_batches = (n_samples + batch_size - 1) // batch_size\n",
    "\n",
    "with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:\n",
    "    sess.run(init_op)\n",
    "    print('\\n Evaluate model on test set...')\n",
    "    saver.restore(sess, PATH + '/NN_model.ckpt')\n",
    "    print('Model restored.')\n",
    "    model_NN.set_session(sess)\n",
    "\n",
    "    # NumPy clips the slice end at the array length, so the last batch keeps\n",
    "    # every remaining row (the previous tail slice stopped one row early and\n",
    "    # relied on the loop variable existing even when no full batch was run).\n",
    "    for j in range(n_batches):\n",
    "        batch = data_2[j * batch_size:(j + 1) * batch_size]\n",
    "        output_dict_NN[j] = model_NN.predict(batch)\n",
    "\n",
    "    # Stitch the per-batch outputs back together in batch order.\n",
    "    output_NN = np.concatenate([output_dict_NN[idx] for idx in range(n_batches)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mass entries for the rows that were scored by the network.\n",
    "masses = data_dict[\"Ds_ConsD_M\"][0:output_NN.shape[0]]\n",
    "\n",
    "# A row is selected when the arg-max over the network outputs is non-zero.\n",
    "# The builtin bool replaces np.bool, which was removed in NumPy 1.24.\n",
    "is_selected = np.argmax(output_NN, axis=1).astype(bool)\n",
    "\n",
    "# Keep element 0 of every mass entry.\n",
    "NN_selected = np.array([entry[0] for entry in masses[is_selected]])\n",
    "full = np.array([entry[0] for entry in masses])\n",
    "\n",
    "# Remove negative masses from the reference sample; ~(full < 0) keeps NaN\n",
    "# values exactly as np.delete(full, np.where(full < 0)) did.\n",
    "# NOTE(review): NN_selected is not filtered the same way -- confirm intended.\n",
    "full = full[~(full < 0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1152x720 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Masses are divided by 1000 for plotting (the x axis is labelled in GeV).\n",
    "full_plot = full / 1000\n",
    "NN_selected_plot = NN_selected / 1000\n",
    "\n",
    "# Same bins and range for both samples so the normalised shapes are comparable.\n",
    "hist_kwargs = dict(bins=120, range=(full_plot.min(), full_plot.max()), density=True)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(16, 10))\n",
    "ax.hist(full_plot, alpha=0.4, label='before NN', **hist_kwargs)\n",
    "ax.hist(NN_selected_plot, alpha=0.5, label='after NN', **hist_kwargs)\n",
    "ax.legend(fontsize=20)\n",
    "ax.set_xlabel('D reconstructed mass (GeV)', fontsize=17)\n",
    "ax.set_ylabel('# events (a.u)', fontsize=17)\n",
    "\n",
    "# The output directory follows the selected flavour, so a run with\n",
    "# l_flv[l_index] == 'e' no longer writes its figure under mu/.\n",
    "fig.savefig('/home/hep/davide/Rphipi/' + l_flv[l_index] + '/test_' + str(test) + '/selected_data_NN.png', format='png', dpi=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the NN-selected masses to the per-channel output directory.\n",
    "flavour_pair = l_flv[l_index] + l_flv[l_index]\n",
    "nn_output_file = ('/disk/lhcb_data/davide/Rphipi/selected_data/' + flavour_pair\n",
    "                  + '/' + 'sel_data_NN_' + flavour_pair + '.npy')\n",
    "np.save(nn_output_file, NN_selected)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# XGBOOST SELECTION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "PATH=l_flv[l_index]+'/test_'+str(test)+'/XG_'+str(i)\n",
    "loaded_model = pickle.load(open(PATH+\"/XG_\"+str(i)+\"_.pickle.dat\", \"rb\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class probabilities from the unpickled model, evaluated on the same\n",
    "# standardised inputs (data_2) that were fed to the neural network.\n",
    "output_XG = loaded_model.predict_proba(data_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mass entries for the rows that were scored by the XGBoost model.\n",
    "masses = data_dict[\"Ds_ConsD_M\"][0:output_XG.shape[0]]\n",
    "\n",
    "# A row is selected when the arg-max over the class probabilities is non-zero.\n",
    "# The builtin bool replaces np.bool, which was removed in NumPy 1.24.\n",
    "is_selected = np.argmax(output_XG, axis=1).astype(bool)\n",
    "\n",
    "# Keep element 0 of every selected mass entry.\n",
    "XG_selected = np.array([entry[0] for entry in masses[is_selected]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1152x720 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Masses are divided by 1000 for plotting (the x axis is labelled in GeV).\n",
    "full_plot = full / 1000\n",
    "XG_selected_plot = XG_selected / 1000\n",
    "\n",
    "# Same bins and range for both samples so the normalised shapes are comparable.\n",
    "hist_kwargs = dict(bins=120, range=(full_plot.min(), full_plot.max()), density=True)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(16, 10))\n",
    "ax.hist(full_plot, alpha=0.4, label='before XG', **hist_kwargs)\n",
    "ax.hist(XG_selected_plot, alpha=0.5, label='after XG', **hist_kwargs)\n",
    "ax.legend(fontsize=20)\n",
    "ax.set_xlabel('D reconstructed mass (GeV)', fontsize=17)\n",
    "ax.set_ylabel('# events (a.u)', fontsize=17)\n",
    "\n",
    "# The output directory follows the selected flavour, so a run with\n",
    "# l_flv[l_index] == 'e' no longer writes its figure under mu/.\n",
    "fig.savefig('/home/hep/davide/Rphipi/' + l_flv[l_index] + '/test_' + str(test) + '/selected_data_XG.png', format='png', dpi=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the XGBoost-selected masses to the per-channel output directory.\n",
    "# The array saved is XG_selected: saving NN_selected here would make the\n",
    "# sel_data_XG_* file a copy of the neural-network selection.\n",
    "flavour_pair = l_flv[l_index] + l_flv[l_index]\n",
    "xg_output_file = ('/disk/lhcb_data/davide/Rphipi/selected_data/' + flavour_pair\n",
    "                  + '/' + 'sel_data_XG_' + flavour_pair + '.npy')\n",
    "np.save(xg_output_file, XG_selected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}