ucsd-hep-ex · aaronw5 · Sep 8, 2024 · Sep 8, 2024 · Sep 8, 2024
diff --git a/notebooks/backgroundRejection.ipynb b/notebooks/backgroundRejection.ipynb
@@ -1,100 +1,129 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e2401b40-915a-4fd6-8c4f-d81bd9973f93",
+   "metadata": {},
+   "source": [
+    "## Convert root files into numpy array"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "id": "fab67872-f3b8-44ff-b327-1ea3a53c6c71",
+   "execution_count": 27,
+   "id": "4a184fac-8f9c-4c63-bff3-0d230eecd71d",
    "metadata": {},
    "outputs": [],
    "source": [
+    "import os\n",
+    "import uproot\n",
     "import numpy as np\n",
     "from sklearn.metrics import accuracy_score\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "from sklearn.metrics import roc_curve, auc, roc_auc_score\n",
     "from scipy.special import softmax\n",
-    "\n"
+    "\n",
+    "\n",
+    "# Define the directory path\n",
+    "directory = \"LinformerEval\"\n",
+    "labelList = []\n",
+    "predList = []\n",
+    "# Loop through the files in the directory\n",
+    "for filename in os.listdir(directory):\n",
+    "    if filename.endswith(\".root\"):\n",
+    "        try:\n",
+    "            filepath = os.path.join(directory, filename)\n",
+    "            background = uproot.open(filepath)\n",
+    "            tree = background['Events;1']\n",
+    "            df =tree.arrays(library=\"pd\")\n",
+    "            labels = df[['label_QCD', 'label_Hbb', 'label_Hcc', 'label_Hgg', 'label_H4q',\n",
+    "                         'label_Hqql', 'label_Zqq', 'label_Wqq', 'label_Tbqq', 'label_Tbl']]\n",
+    "            pred = df[['score_label_QCD', 'score_label_Hbb', 'score_label_Hcc', 'score_label_Hgg', 'score_label_H4q',\n",
+    "                         'score_label_Hqql', 'score_label_Zqq', 'score_label_Wqq', 'score_label_Tbqq', 'score_label_Tbl']] \n",
+    "            labelList.append(labels.to_numpy())\n",
+    "            predList.append(pred.to_numpy())\n",
+    "        except:\n",
+    "            continue\n",
+    "        \n",
+    "labels = np.concatenate(labelList).astype(int)\n",
+    "y_prob = np.concatenate(predList)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "18a1583c-9c4c-4c59-a824-b89268f4665c",
+   "metadata": {},
+   "source": [
+    "## Find background rejection\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
-   "id": "47b70f51-9661-4b04-a9fa-cc80ab49d046",
+   "execution_count": null,
+   "id": "b8d7f111-e095-4e5a-89f5-e0ada0647b3a",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Rejection at 50.0% for label_Hbb: 4246.284501061571\n",
-      "Rejection at 50.0% for label_Hcc: 1046.0251046025105\n",
-      "Rejection at 50.0% for label_Hgg: 82.05128205128204\n",
-      "Rejection at 50.0% for label_H4q: 416.49312786339027\n",
-      "Rejection at 99.0% for label_Hqql: 570.7762557077625\n",
-      "Rejection at 50.0% for label_Zqq: 189.48365703458077\n",
-      "Rejection at 50.0% for label_Wqq: 210.1281781886951\n",
-      "Rejection at 50.0% for label_Tbqq: 2347.417840375587\n",
-      "Rejection at 99.5% for label_Tbl: 1307.1895424836603\n",
-      "Overall ROC AUC = 0.9357, Accuracy = 0.7969\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "label_list = ['label_QCD' , 'label_Hbb', 'label_Hcc', 'label_Hgg', 'label_H4q', 'label_Hqql', 'label_Zqq', 'label_Wqq', 'label_Tbqq', 'label_Tbl']  # Replace with your actual class labels\n",
     "\n",
     "n_classes = 10 \n",
     "\n",
-    "predictions = np.load('outputs_base.npy')\n",
-    "\n",
-    "labels = np.load('labels_base.npy')\n",
-    "\n",
-    "y_prob = softmax(predictions, axis=1)  \n",
-    "\n",
     "scores = y_prob[:,1:10]/ (y_prob[:,0][:, np.newaxis] + y_prob[:,1:10])\n",
     "\n",
     "scores = np.concatenate((y_prob[:,0].reshape(len(scores),1), scores), axis = 1)\n",
-    "\n",
     "rejections = []\n",
     "\n",
     "for i in range(1, n_classes):  \n",
-    "    percent = 0.5\n",
-    "    \n",
-    "    mask = (labels[:, 0] == 1) | (labels[:, i] == 1)\n",
-    "    filtered_labels = labels[mask]\n",
-    "    filtered_scores = scores[mask]\n",
-    "    \n",
-    "    binary_labels = (filtered_labels[:, i] == 1).astype(int)\n",
-    "    \n",
-    "    binary_scores = filtered_scores[:, i]\n",
+    "    try:\n",
+    "        percent = 0.5\n",
+    "        \n",
+    "        mask = (labels[:, 0] == 1) | (labels[:, i] == 1)\n",
+    "        filtered_labels = labels[mask]\n",
+    "        filtered_scores = scores[mask]\n",
+    "        \n",
+    "        binary_labels = (filtered_labels[:, i] == 1).astype(int)\n",
+    "        \n",
+    "        binary_scores = filtered_scores[:, i]\n",
+    "        \n",
+    "        fpr, tpr, thresholds = roc_curve(binary_labels, binary_scores)\n",
     "    \n",
-    "    fpr, tpr, thresholds = roc_curve(binary_labels, binary_scores)\n",
-    "\n",
-    "    if i == 5:\n",
-    "        percent = 0.99\n",
-    "    if i == 9:\n",
-    "        percent = 0.995\n",
-    "    \n",
-    "    idx = np.abs(tpr - percent).argmin()\n",
-    "    \n",
-    "    if fpr[idx] != 0:\n",
-    "        rejection = 1 / fpr[idx]\n",
-    "    else:\n",
-    "        rejection = np.inf  \n",
-    "    \n",
-    "    rejections.append(rejection)\n",
-    "\n",
-    "    \n",
-    "    print(f'Rejection at {percent*100}% for {label_list[i]}: {rejection}')\n",
+    "        if i == 5:\n",
+    "            percent = 0.99\n",
+    "        if i == 9:\n",
+    "            percent = 0.995\n",
+    "        \n",
+    "        idx = np.abs(tpr - percent).argmin()\n",
+    "        \n",
+    "        if fpr[idx] != 0:\n",
+    "            rejection = 1 / fpr[idx]\n",
+    "        else:\n",
+    "            rejection = np.inf  \n",
+    "        \n",
+    "        rejections.append(rejection)\n",
     "    \n",
+    "        \n",
+    "        print(f'Rejection at {percent*100}% for {label_list[i]}: {rejection}')\n",
+    "    except:\n",
+    "        continue\n",
+    "print(labels.shape)\n",
     "overall_roc_auc = roc_auc_score(labels, scores, average='macro', multi_class='ovo')\n",
     "\n",
-    "predicted_labels = np.argmax(softmax(base, axis=1), axis=1) \n",
+    "predicted_labels = np.argmax(y_prob, axis=1)  \n",
     "true_labels = np.argmax(labels, axis=1)  \n",
     "\n",
     "accuracy = accuracy_score(true_labels, predicted_labels)\n",
     "\n",
     "print(f'Overall ROC AUC = {overall_roc_auc:.4f}, Accuracy = {accuracy:.4f}')\n"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "797b4e11-7951-4762-a4e2-592ce9f8db4f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {