{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import pickle\n",
    "import math\n",
    "\n",
    "trunc_normal= tf.truncated_normal_initializer(stddev=1)\n",
    "normal = tf.random_normal_initializer(stddev=1)\n",
    "\n",
    "from xgboost import XGBClassifier\n",
    "from architectures.data_processing import *\n",
    "from architectures.utils.toolbox import *\n",
    "from architectures.DNN import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# l_index picks the flavour label ('e' or 'mu') used to build every file path below.\n",
    "l_index=0\n",
    "l_flv=['e','mu']\n",
    "mother_ID=[\"Ds\",\"Dplus\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DATA LOADING & PREPROCESSING"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pickled dictionary of branches for the selected flavour.\n",
    "with open('/disk/lhcb_data/davide/Rphipi/NN_for_selection/'+l_flv[l_index]+l_flv[l_index]+'/'+'data_for_NN_'+l_flv[l_index]+l_flv[l_index]+'.pickle', 'rb') as f:\n",
    "    data_dict=pickle.load(f, encoding='latin1')\n",
    "\n",
    "# Divide the chi2 branches by a number of degrees of freedom, then drop the NDOF branches.\n",
    "# NOTE(review): the *_IPCHI2_OWNPV branches are divided by *_ENDVERTEX_NDOF -\n",
    "# confirm this matches the preprocessing used when the models were trained.\n",
    "# This cell is not re-runnable on the same data_dict: the NDOF keys are deleted below.\n",
    "data_dict[\"Ds_ENDVERTEX_CHI2\"]=data_dict[\"Ds_ENDVERTEX_CHI2\"]/data_dict[\"Ds_ENDVERTEX_NDOF\"]\n",
    "data_dict[\"Ds_OWNPV_CHI2\"]=data_dict[\"Ds_OWNPV_CHI2\"]/data_dict[\"Ds_OWNPV_NDOF\"]\n",
    "data_dict[\"Ds_IPCHI2_OWNPV\"]=data_dict[\"Ds_IPCHI2_OWNPV\"]/data_dict[\"Ds_ENDVERTEX_NDOF\"]\n",
    "\n",
    "del data_dict[\"Ds_ENDVERTEX_NDOF\"]\n",
    "del data_dict[\"Ds_OWNPV_NDOF\"]\n",
    "\n",
    "data_dict[\"phi_ENDVERTEX_CHI2\"]=data_dict[\"phi_ENDVERTEX_CHI2\"]/data_dict[\"phi_ENDVERTEX_NDOF\"]\n",
    "#data_dict[\"phi_OWNPV_CHI2\"]=data_dict[\"phi_OWNPV_CHI2\"]/data_dict[\"phi_OWNPV_NDOF\"]\n",
    "data_dict[\"phi_IPCHI2_OWNPV\"]=data_dict[\"phi_IPCHI2_OWNPV\"]/data_dict[\"phi_ENDVERTEX_NDOF\"]\n",
    "\n",
    "del data_dict[\"phi_ENDVERTEX_NDOF\"]\n",
    "#del data_dict[\"phi_OWNPV_NDOF\"]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "task='TEST'\n",
    "\n",
    "# test / i identify the model directory <flavour>/test_<test>/NN_<i> (and XG_<i> further down).\n",
    "test=0\n",
    "i=4\n",
    "PATH=l_flv[l_index]+'/test_'+str(test)+'/NN_'+str(i)\n",
    "\n",
    "with open(PATH+'/variables_used.pkl', 'rb') as f:\n",
    "    branches_needed = pickle.load(f)\n",
    "\n",
    "# m: length of the first axis of Ds_ConsD_M, passed to extract_array below.\n",
    "m=data_dict[\"Ds_ConsD_M\"].shape[0]\n",
    "# Drop the last stored entry (presumably a non-feature column such as the label - TODO confirm).\n",
    "branches_needed.pop()\n",
    "# dim: number of input features handed to the network.\n",
    "dim=len(branches_needed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = extract_array(data_dict, branches_needed, dim, m)\n",
    "\n",
    "# Standardise every column to zero mean and unit standard deviation.\n",
    "# NOTE(review): the statistics come from this sample itself, not from the training set - confirm intended.\n",
    "data_mean=data.mean(axis=0)\n",
    "data_1=data-data_mean\n",
    "data_std=data_1.std(axis=0)\n",
    "data_2=data_1/data_std\n",
    "# Sanity check displayed as the cell output: every column should now have std 1.\n",
    "data_2.std(axis=0)\n",
    "#data_2=data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NN SELECTION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restore the hyper-parameters stored next to the model.\n",
    "with open(PATH+'/hyper_parameters.pkl', 'rb') as f:\n",
    "    hyper_dict = pickle.load(f)\n",
    "    #for key, item in hyper_dict.items():\n",
    "    #    print(key+':'+str(item))\n",
    "\n",
    "k=hyper_dict[\"k\"]\n",
    "LEARNING_RATE=hyper_dict[\"LEARNING_RATE\"]\n",
    "BETA1=hyper_dict[\"BETA1\"]\n",
    "BATCH_SIZE=hyper_dict[\"BATCH_SIZE\"]\n",
    "EPOCHS=hyper_dict[\"EPOCHS\"]\n",
    "VAL_PERIOD=hyper_dict[\"VAL_PERIOD\"]\n",
    "SEED=hyper_dict[\"SEED\"]\n",
    "sizes=hyper_dict[\"sizes\"]\n",
    "LAMBD=hyper_dict[\"LAMBD\"]\n",
    "# From here on PATH is the directory recorded inside the hyper-parameter file.\n",
    "PATH=hyper_dict[\"PATH\"]\n",
    "\n",
    "if not os.path.exists(PATH+'/hyper_parameters.pkl'):\n",
    "    print(\"No saved sizes dict\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input for propagation (?, 13)\n",
      "Logits shape (?, 2)\n",
      "Input for propagation (?, 13)\n",
      "Logits shape (?, 2)\n"
     ]
    }
   ],
   "source": [
    "# Rebuild the graph with the stored hyper-parameters so the checkpoint can be restored into it.\n",
    "tf.reset_default_graph()\n",
    "model_NN = DNN(dim, sizes,\n",
    "               lr=LEARNING_RATE, beta1=BETA1, lambd=LAMBD,\n",
    "               batch_size=BATCH_SIZE, epochs=EPOCHS,\n",
    "               save_sample=VAL_PERIOD, path=PATH, seed=SEED)\n",
    "\n",
    "# Only the non-trainable global variables are initialised here.\n",
    "vars_to_train=tf.trainable_variables()\n",
    "vars_all = tf.global_variables()\n",
    "vars_to_init = list(set(vars_all)-set(vars_to_train))\n",
    "init_op = tf.variables_initializer(vars_to_init)\n",
    "\n",
    "saver = tf.train.Saver()\n",
    "gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " Evaluate model on test set...\n",
      "INFO:tensorflow:Restoring parameters from e/test_0/NN_4/NN_model.ckpt\n",
      "Model restored.\n"
     ]
    }
   ],
   "source": [
    "# Run the restored network over the standardised data in fixed-size batches.\n",
    "output_dict_NN={}\n",
    "batch_size=200\n",
    "n_samples = data_2.shape[0]\n",
    "n_batches = n_samples//batch_size\n",
    "\n",
    "with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:\n",
    "    sess.run(init_op)\n",
    "    print('\\n Evaluate model on test set...')\n",
    "    saver.restore(sess,PATH+'/NN_model.ckpt')\n",
    "    print('Model restored.')\n",
    "    model_NN.set_session(sess)\n",
    "    for j in range(n_batches):\n",
    "        small_dataset = data_2[j*batch_size:(j+1)*batch_size]\n",
    "        output_dict_NN[j] = model_NN.predict(small_dataset)\n",
    "\n",
    "    # Remainder batch: take everything after the last full batch, including the final row.\n",
    "    # Indexing with n_batches (not the loop variable) also works when there is no full batch.\n",
    "    if n_samples%batch_size != 0:\n",
    "        output_dict_NN[n_batches] = model_NN.predict(data_2[n_batches*batch_size:])\n",
    "\n",
    "    output_NN=np.concatenate([output_dict_NN[i] for i in range(len(output_dict_NN))])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a: mass entries of the rows whose arg-max over the two network outputs is class 1.\n",
    "a = data_dict[\"Ds_ConsD_M\"][0:output_NN.shape[0]][np.argmax(output_NN, axis=1).astype(bool)]\n",
    "# b: mass entries of every row that went through the network.\n",
    "b = [data_dict[\"Ds_ConsD_M\"][0:output_NN.shape[0]][i] for i in range(output_NN.shape[0])]\n",
    "\n",
    "# Keep only the first element of each entry.\n",
    "NN_selected=np.array([a[i][0] for i in range(len(a))])\n",
    "full = np.array([b[i][0] for i in range(len(b))])\n",
    "# Negative values are removed from the reference distribution.\n",
    "full=np.delete(full,np.where(full<0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1152x720 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Compare the normalised mass distributions before and after the NN selection.\n",
    "full_plot=full/1000\n",
    "NN_selected_plot=NN_selected/1000\n",
    "plt.hist(full_plot,alpha=0.4,bins=120, range=(full_plot.min(),full_plot.max()),density=True,label='before NN');\n",
    "plt.hist(NN_selected_plot,alpha=0.5,bins=120, range=(full_plot.min(),full_plot.max()),density=True,label='after NN');\n",
    "plt.legend(fontsize=20)\n",
    "\n",
    "plt.xlabel('D reconstructed mass (GeV)', fontsize=17)\n",
    "plt.ylabel('# events (a.u)', fontsize=17)\n",
    "\n",
    "fig=plt.gcf();\n",
    "fig.set_size_inches(16,10)\n",
    "# Save under the directory of the flavour being processed (was hard-coded to 'mu').\n",
    "plt.savefig('/home/hep/davide/Rphipi/'+l_flv[l_index]+'/test_'+str(test)+'/selected_data_NN.png', format='png', dpi=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the NN-selected values.\n",
    "np.save('/disk/lhcb_data/davide/Rphipi/selected_data/'+l_flv[l_index]+l_flv[l_index]+'/'+'sel_data_NN_'+l_flv[l_index]+l_flv[l_index]+'.npy', NN_selected)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# XGBOOST SELECTION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "PATH=l_flv[l_index]+'/test_'+str(test)+'/XG_'+str(i)\n",
    "# Context manager so the model file is closed after unpickling.\n",
    "with open(PATH+\"/XG_\"+str(i)+\"_.pickle.dat\", \"rb\") as f:\n",
    "    loaded_model = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class probabilities from the pickled XGBoost model on the same standardised inputs.\n",
    "output_XG=loaded_model.predict_proba(data_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a: mass entries of the rows whose arg-max over the predicted probabilities is class 1.\n",
    "a = data_dict[\"Ds_ConsD_M\"][0:output_XG.shape[0]][np.argmax(output_XG, axis=1).astype(bool)]\n",
    "\n",
    "XG_selected=np.array([a[i][0] for i in range(len(a))])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 1152x720 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Compare the normalised mass distributions before and after the XGBoost selection.\n",
    "# `full` is the reference distribution built in the NN selection section.\n",
    "full_plot=full/1000\n",
    "XG_selected_plot=XG_selected/1000\n",
    "\n",
    "plt.hist(full_plot,alpha=0.4,bins=120, range=(full_plot.min(),full_plot.max()),density=True,label='before XG');\n",
    "plt.hist(XG_selected_plot,alpha=0.5,bins=120, range=(full_plot.min(),full_plot.max()),density=True,label='after XG');\n",
    "plt.legend(fontsize=20)\n",
    "plt.xlabel('D reconstructed mass (GeV)', fontsize=17)\n",
    "plt.ylabel('# events (a.u)', fontsize=17)\n",
    "fig=plt.gcf();\n",
    "fig.set_size_inches(16,10)\n",
    "# Save under the directory of the flavour being processed (was hard-coded to 'mu').\n",
    "plt.savefig('/home/hep/davide/Rphipi/'+l_flv[l_index]+'/test_'+str(test)+'/selected_data_XG.png', format='png', dpi=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the XGBoost-selected values (previously NN_selected was written to this file by mistake).\n",
    "np.save('/disk/lhcb_data/davide/Rphipi/selected_data/'+l_flv[l_index]+l_flv[l_index]+'/'+'sel_data_XG_'+l_flv[l_index]+l_flv[l_index]+'.npy', XG_selected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}