Skip to content

Commit 2a5f6db

Browse files
ZsailerGitHub Enterprise
authored and
GitHub Enterprise
committed
Hubble metrics to cell latency, execution count, and notebook service calls (jupyter-server#407)
* hubble metrics to measure execution count and latency and all notebook service calls * Bump to 0.21.1 * add metric for a disconnected kernel
1 parent 30b4c5d commit 2a5f6db

File tree

11 files changed

+112
-5
lines changed

11 files changed

+112
-5
lines changed

data_studio_jupyter_extensions/__init__.py

+7
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ def _jupyter_server_extension_points(): # pragma: no cover
3030
from data_studio_jupyter_extensions.extensions.kernel_actions.extension import (
3131
KernelActionsExtension,
3232
)
33+
from data_studio_jupyter_extensions.extensions.kernel_websocket_override.extension import (
34+
KernelWebsocketOverrideExtension,
35+
)
3336

3437
return [
3538
{
@@ -56,4 +59,8 @@ def _jupyter_server_extension_points(): # pragma: no cover
5659
"module": "data_studio_jupyter_extensions.extensions.external_links.extension",
5760
"app": KernelActionsExtension,
5861
},
62+
{
63+
"module": "data_studio_jupyter_extensions.extensions.kernel_websocket_override.extension",
64+
"app": KernelWebsocketOverrideExtension,
65+
},
5966
]
+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.21.0" # pragma: no cover
1+
__version__ = "0.21.1" # pragma: no cover

data_studio_jupyter_extensions/configurables/kernel_monitor.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,8 @@ async def poll(self):
160160
# If the kernel is alive, but not communicating,
161161
# raise a helpful message in the status menu and
162162
# shutdown the kernel.
163+
HUBBLE_METRICS.KERNEL_DISCONNECTED.emit()
163164
self._disconnected_state()
164-
await self.kernel_manager.shutdown_kernel()
165165
self._attempt_count = 0
166166

167167
def is_culling_enabled(self) -> bool:

data_studio_jupyter_extensions/configurables/notebook_service.py

+7
Original file line numberDiff line numberDiff line change
@@ -195,23 +195,28 @@ async def get_list_of_kernelspecs_for_notebook(self):
195195
out = await self.fetch("kernelspecs", method="get")
196196
return out
197197

198+
@hubble(HUBBLE_METRICS.NBSERVICE_KERNELSPECS_BY_NAME)
198199
async def get_kernelspec_by_name(self, name):
199200
"""Get the kernelspec by name."""
200201
return await self.fetch("kernelspec", name, method="get")
201202

203+
@hubble(HUBBLE_METRICS.NBSERVICE_START_KERNEL)
202204
async def start_kernel(self, kernelspec_id):
203205
"""Start a kernel."""
204206
data = {"kernelSpecId": kernelspec_id}
205207
return await self.fetch("kernels", method="post", data=data)
206208

209+
@hubble(HUBBLE_METRICS.NBSERVICE_GET_KERNEL_DETAILS)
207210
async def get_kernel_details(self, process_id):
208211
"""Get kernel details by id."""
209212
return await self.fetch(f"/kernels/{process_id}", method="get")
210213

214+
@hubble(HUBBLE_METRICS.NBSERVICE_STOP_KERNEL)
211215
async def stop_kernel(self, process_id):
212216
"""Stop kernel by id."""
213217
return await self.fetch(f"/kernels/{process_id}", method="delete")
214218

219+
@hubble(HUBBLE_METRICS.NBSERVICE_GET_KERNEL_STATUS)
215220
async def get_kernel_status(self, process_id, query_params_dict=None):
216221
"""Get kernel status by id."""
217222
url = f"/kernels/{process_id}/status"
@@ -220,10 +225,12 @@ async def get_kernel_status(self, process_id, query_params_dict=None):
220225
url = f"{url}?{query_params}"
221226
return await self.fetch(url, method="get")
222227

228+
@hubble(HUBBLE_METRICS.NBSERVICE_LIST_KERNELS)
223229
async def list_kernels(self):
224230
"""List all kernels for the given notebook server."""
225231
return await self.fetch("kernels", method="get")
226232

233+
@hubble(HUBBLE_METRICS.NBSERVICE_SHUTDOWN_ALL_KERNELS)
227234
async def shutdown_all_kernels(self):
228235
"""Shutdown all kernels for the given notebook server."""
229236
return await self.fetch("kernels", method="delete")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from data_studio_jupyter_extensions.extensions.kernel_websocket_override.extension import (
2+
KernelWebsocketOverrideExtension,
3+
)
4+
5+
6+
def _jupyter_server_extension_points(): # pragma: no cover
7+
return [
8+
{
9+
"module": "data_studio_jupyter_extensions.extensions.kernel_websocket_override.extension",
10+
"app": KernelWebsocketOverrideExtension,
11+
}
12+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from jupyter_server.extension.application import ExtensionApp
2+
3+
from .handlers import handlers
4+
5+
6+
class KernelWebsocketOverrideExtension(ExtensionApp):
7+
"""Jupyter Server extension that overrides the kernel
8+
websocket channels handler to emit Hubble metrics.
9+
"""
10+
11+
name = "kernel_websocket_override"
12+
handlers = handlers
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
from jupyter_server.base.zmqhandlers import AuthenticatedZMQStreamHandler
2+
from jupyter_server.services.kernels.handlers import _kernel_id_regex
3+
from jupyter_server.services.kernels.handlers import ZMQChannelsHandler
4+
5+
from data_studio_jupyter_extensions.hubble.metrics import HUBBLE_METRICS
6+
7+
8+
def hubble_execution_time_and_count(msg):
9+
"""Emit the time it takes to run a cell and the execution count to hubble."""
10+
msg_type = msg.get("msg_type", None)
11+
if msg_type == "execute_reply":
12+
tdelta = msg["header"]["date"] - msg["parent_header"]["date"]
13+
seconds = tdelta.total_seconds()
14+
HUBBLE_METRICS.KERNEL_EXECUTION_LATENCY.emit(value=seconds)
15+
count = msg["content"]["execution_count"]
16+
HUBBLE_METRICS.KERNEL_EXECUTION_COUNT.emit(value=count)
17+
18+
19+
class OverrideZMQChannelsHandler(ZMQChannelsHandler):
20+
"""Adds hubble metrics to Jupyter websocket handler."""
21+
22+
# This is a complete fork of the Jupyter Websocket handler
23+
# method. We don't want to rely on this long term, but should
24+
# work on getting this into Jupyter Telemetry.
25+
def _on_zmq_reply(self, stream, msg_list):
26+
idents, fed_msg_list = self.session.feed_identities(msg_list)
27+
28+
if self.subprotocol == "v1.kernel.websocket.jupyter.org":
29+
msg = {"header": None, "parent_header": None, "content": None}
30+
else:
31+
msg = self.session.deserialize(fed_msg_list)
32+
33+
# This chunk is (the only) DataStudio customization for collecting hubble metrics.
34+
# if this chunk fails, don't stop messages from going through. just log a warning.
35+
try:
36+
hubble_execution_time_and_count(msg)
37+
except Exception as err:
38+
self.log.warning(err)
39+
40+
channel = getattr(stream, "channel", None)
41+
parts = fed_msg_list[1:]
42+
43+
self._on_error(channel, msg, parts)
44+
45+
if self._limit_rate(channel, msg, parts):
46+
return
47+
48+
if self.subprotocol == "v1.kernel.websocket.jupyter.org":
49+
AuthenticatedZMQStreamHandler._on_zmq_reply(self, stream, parts)
50+
else:
51+
AuthenticatedZMQStreamHandler._on_zmq_reply(self, stream, msg)
52+
53+
54+
handlers = [
55+
(
56+
rf"/api/kernels/{_kernel_id_regex}/channels",
57+
OverrideZMQChannelsHandler,
58+
),
59+
]

data_studio_jupyter_extensions/hubble/metrics.py

+10
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ class _HUBBLE_METRICS:
2222
KERNEL_POD_LAUNCH_TIME = Metric("kernel_pod_launch_time")
2323
# Compute how long it takes for a kernel pod + first heartbeat to be established
2424
KERNEL_READY_LATENCY = Metric("kernel_ready_latency")
25+
KERNEL_EXECUTION_COUNT = Metric("kernel_execution_count")
26+
KERNEL_EXECUTION_LATENCY = Metric("kernel_execution_latency")
27+
KERNEL_DISCONNECTED = Metric("kernel_disconnected")
2528

2629
# Hubble metrics from Jupyter Server handlers.
2730
API_KERNELS_ROOT = HandlerMetric("/api/kernels")
@@ -43,6 +46,13 @@ class _HUBBLE_METRICS:
4346
# Hubble metrics from Notebook Service responses.
4447
NBSERVICE_KERNELSPECS = ResponseMetric("/nbservice/kernelspecs")
4548
NBSERVICE_LINKS = ResponseMetric("/nbservice/links")
49+
NBSERVICE_KERNELSPECS_BY_NAME = ResponseMetric("/nbservice/kernelspecs/*")
50+
NBSERVICE_START_KERNEL = ResponseMetric("/nbservice/kernels")
51+
NBSERVICE_GET_KERNEL_DETAILS = ResponseMetric("/nbservice/kernels/*")
52+
NBSERVICE_STOP_KERNEL = ResponseMetric("/nbservice/kernels/*")
53+
NBSERVICE_GET_KERNEL_STATUS = ResponseMetric("/nbservice/kernels/*/status")
54+
NBSERVICE_LIST_KERNELS = ResponseMetric("/nbservice/kernels")
55+
NBSERVICE_SHUTDOWN_ALL_KERNELS = ResponseMetric("/nbservice/kernels")
4656

4757

4858
# Instantiate the Hubble metrics dataclass for reference

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ testpaths = [
2323
]
2424

2525
[tool.tbump.version]
26-
current = "0.21.0"
26+
current = "0.21.1"
2727
regex = '''
2828
(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
2929
((?P<channel>a|b|rc|.dev)(?P<release>\d+))?

src/status.tsx

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ import { ISessionContext, ReactWidget } from '@jupyterlab/apputils';
3131
import { EventListener } from './eventlistener';
3232

3333
// This is managed by tbump config in pyproject.toml
34-
const VERSION = '0.21.0';
34+
const VERSION = '0.21.1';
3535

3636
// Define the error states
3737
// https://github.pie.apple.com/pie-data-studio/notebook-service/blob/761d63604966db5918d2e491c0f89cce454b7f67/app/com/apple/datastudio/model/ResourceState.scala#L20

version

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.21.0
1+
0.21.1

0 commit comments

Comments
 (0)