From 189633448c9077f9fff58f9f8167f9dd67e23572 Mon Sep 17 00:00:00 2001 From: aaronw5 <139774687+aaronw5@users.noreply.github.com> Date: Sun, 8 Sep 2024 02:02:23 -0500 Subject: [PATCH 1/3] Add files via upload --- notebooks/backgroundRejection.ipynb | 155 +++++++++++++++++----------- 1 file changed, 97 insertions(+), 58 deletions(-) diff --git a/notebooks/backgroundRejection.ipynb b/notebooks/backgroundRejection.ipynb index c5814c1..e35fdd2 100644 --- a/notebooks/backgroundRejection.ipynb +++ b/notebooks/backgroundRejection.ipynb @@ -1,100 +1,139 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "e2401b40-915a-4fd6-8c4f-d81bd9973f93", + "metadata": {}, + "source": [ + "## Convert root files into numpy array" + ] + }, { "cell_type": "code", - "execution_count": 1, - "id": "fab67872-f3b8-44ff-b327-1ea3a53c6c71", + "execution_count": 27, + "id": "4a184fac-8f9c-4c63-bff3-0d230eecd71d", "metadata": {}, "outputs": [], "source": [ + "import os\n", + "import uproot\n", "import numpy as np\n", "from sklearn.metrics import accuracy_score\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from sklearn.metrics import roc_curve, auc, roc_auc_score\n", "from scipy.special import softmax\n", - "\n" + "\n", + "\n", + "# Define the directory path\n", + "directory = \"LinformerEval\"\n", + "labelList = []\n", + "predList = []\n", + "# Loop through the files in the directory\n", + "for filename in os.listdir(directory):\n", + " if filename.endswith(\".root\"):\n", + " try:\n", + " filepath = os.path.join(directory, filename)\n", + " background = uproot.open(filepath)\n", + " tree = background['Events;1']\n", + " df =tree.arrays(library=\"pd\")\n", + " labels = df[['label_QCD', 'label_Hbb', 'label_Hcc', 'label_Hgg', 'label_H4q',\n", + " 'label_Hqql', 'label_Zqq', 'label_Wqq', 'label_Tbqq', 'label_Tbl']]\n", + " pred = df[['score_label_QCD', 'score_label_Hbb', 'score_label_Hcc', 'score_label_Hgg', 'score_label_H4q',\n", + " 'score_label_Hqql', 'score_label_Zqq', 'score_label_Wqq', 'score_label_Tbqq', 'score_label_Tbl']] \n", + " labelList.append(labels.to_numpy())\n", + " predList.append(pred.to_numpy())\n", + " except:\n", + " continue\n", + " \n", + "labels = np.concatenate(labelList).astype(int)\n", + "y_prob = np.concatenate(predList)" + ] + }, + { + "cell_type": "markdown", + "id": "18a1583c-9c4c-4c59-a824-b89268f4665c", + "metadata": {}, + "source": [ + "## Find background rejection\n" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "47b70f51-9661-4b04-a9fa-cc80ab49d046", + "execution_count": null, + "id": "b8d7f111-e095-4e5a-89f5-e0ada0647b3a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rejection at 50.0% for label_Hbb: 4246.284501061571\n", - "Rejection at 50.0% for label_Hcc: 1046.0251046025105\n", - "Rejection at 50.0% for label_Hgg: 82.05128205128204\n", - "Rejection at 50.0% for label_H4q: 416.49312786339027\n", - "Rejection at 99.0% for label_Hqql: 570.7762557077625\n", - "Rejection at 50.0% for label_Zqq: 189.48365703458077\n", - "Rejection at 50.0% for label_Wqq: 210.1281781886951\n", - "Rejection at 50.0% for label_Tbqq: 2347.417840375587\n", - "Rejection at 99.5% for label_Tbl: 1307.1895424836603\n", - "Overall ROC AUC = 0.9357, Accuracy = 0.7969\n" - ] - } - ], + "outputs": [], "source": [ "label_list = ['label_QCD' , 'label_Hbb', 'label_Hcc', 'label_Hgg', 'label_H4q', 'label_Hqql', 'label_Zqq', 'label_Wqq', 'label_Tbqq', 'label_Tbl'] # Replace with your actual class labels\n", "\n", "n_classes = 10 \n", "\n", - "predictions = np.load('outputs_base.npy')\n", - "\n", - "labels = np.load('labels_base.npy')\n", - "\n", - "y_prob = softmax(predictions, axis=1) \n", - "\n", "scores = y_prob[:,1:10]/ (y_prob[:,0][:, np.newaxis] + y_prob[:,1:10])\n", "\n", "scores = np.concatenate((y_prob[:,0].reshape(len(scores),1), scores), axis = 1)\n", - "\n", "rejections = []\n", "\n", "for i in range(1, n_classes): \n", - " percent = 0.5\n", - " \n", - " mask = (labels[:, 0] == 1) | (labels[:, i] == 1)\n", - " filtered_labels = labels[mask]\n", - " filtered_scores = scores[mask]\n", - " \n", - " binary_labels = (filtered_labels[:, i] == 1).astype(int)\n", + " try:\n", + " percent = 0.5\n", + " \n", + " mask = (labels[:, 0] == 1) | (labels[:, i] == 1)\n", + " filtered_labels = labels[mask]\n", + " filtered_scores = scores[mask]\n", + " \n", + " binary_labels = (filtered_labels[:, i] == 1).astype(int)\n", + " \n", + " binary_scores = filtered_scores[:, i]\n", + " \n", + " fpr, tpr, thresholds = roc_curve(binary_labels, binary_scores)\n", " \n", - " binary_scores = filtered_scores[:, i]\n", - " \n", - " fpr, tpr, thresholds = roc_curve(binary_labels, binary_scores)\n", - "\n", - " if i == 5:\n", - " percent = 0.99\n", - " if i == 9:\n", - " percent = 0.995\n", - " \n", - " idx = np.abs(tpr - percent).argmin()\n", - " \n", - " if fpr[idx] != 0:\n", - " rejection = 1 / fpr[idx]\n", - " else:\n", - " rejection = np.inf \n", - " \n", - " rejections.append(rejection)\n", - "\n", - " \n", - " print(f'Rejection at {percent*100}% for {label_list[i]}: {rejection}')\n", + " if i == 5:\n", + " percent = 0.99\n", + " if i == 9:\n", + " percent = 0.995\n", + " \n", + " idx = np.abs(tpr - percent).argmin()\n", + " \n", + " if fpr[idx] != 0:\n", + " rejection = 1 / fpr[idx]\n", + " else:\n", + " rejection = np.inf \n", + " \n", + " rejections.append(rejection)\n", " \n", + " \n", + " print(f'Rejection at {percent*100}% for {label_list[i]}: {rejection}')\n", + " except:\n", + " continue\n", + "print(labels.shape)\n", "overall_roc_auc = roc_auc_score(labels, scores, average='macro', multi_class='ovo')\n", "\n", - "predicted_labels = np.argmax(softmax(base, axis=1), axis=1) \n", + "predicted_labels = np.argmax(y_prob, axis=1) \n", "true_labels = np.argmax(labels, axis=1) \n", "\n", "accuracy = accuracy_score(true_labels, predicted_labels)\n", "\n", "print(f'Overall ROC AUC = {overall_roc_auc:.4f}, Accuracy = {accuracy:.4f}')\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "daa36b99-66ff-4900-b9c9-1ff9fa045e7e", + "metadata": {}, + "outputs": [], + "source": [ + "np.sum(labels, axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "797b4e11-7951-4762-a4e2-592ce9f8db4f", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 3cffa5422fd0e2c3d186e2f26390efe49ce60aef Mon Sep 17 00:00:00 2001 From: aaronw5 <139774687+aaronw5@users.noreply.github.com> Date: Sun, 8 Sep 2024 02:03:46 -0500 Subject: [PATCH 2/3] Updating background rejection notebook for .root files From b4069d777240473da8c9b96b416f58addd0f17eb Mon Sep 17 00:00:00 2001 From: aaronw5 <139774687+aaronw5@users.noreply.github.com> Date: Sun, 8 Sep 2024 02:29:14 -0500 Subject: [PATCH 3/3] Adding background rejection with output root files --- notebooks/backgroundRejection.ipynb | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/notebooks/backgroundRejection.ipynb b/notebooks/backgroundRejection.ipynb index e35fdd2..835306f 100644 --- a/notebooks/backgroundRejection.ipynb +++ b/notebooks/backgroundRejection.ipynb @@ -117,16 +117,6 @@ "print(f'Overall ROC AUC = {overall_roc_auc:.4f}, Accuracy = {accuracy:.4f}')\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "daa36b99-66ff-4900-b9c9-1ff9fa045e7e", - "metadata": {}, - "outputs": [], - "source": [ - "np.sum(labels, axis=0)" - ] - }, { "cell_type": "code", "execution_count": null,