Skip to content

Commit f897e6a

Browse files
authored
Merge pull request #278 from JuliaParallel/jps/logging-overhaul
Logging overhaul and new web dashboard
2 parents 6a1aa1f + 11fb9f6 commit f897e6a

25 files changed

+1940
-183
lines changed

Diff for: .buildkite/pipeline.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
env:
66
JULIA_NUM_THREADS: 1
77
steps:
8-
- label: Julia 1.x
8+
- label: Julia 1.6
99
command: julia --project -e 'using Pkg; Pkg.build(); Pkg.test()'
1010
timeout_in_minutes: 60
1111
<<: *test
1212
plugins:
1313
- JuliaCI/julia#v1:
14-
version: "1.5"
14+
version: "1.6"
1515
- JuliaCI/julia-test#v1:
1616
# - JuliaCI/julia-coverage#v1:
1717
# codecov: true

Diff for: Project.toml

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "Dagger"
22
uuid = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
3-
version = "0.13.1"
3+
version = "0.13.2"
44

55
[deps]
66
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
@@ -24,9 +24,9 @@ Colors = "0.10, 0.11, 0.12"
2424
MemPool = "0.3.4"
2525
Requires = "1"
2626
StatsBase = "0.28, 0.29, 0.30, 0.31, 0.32, 0.33"
27-
Tables = "1.1"
2827
TableOperations = "1"
29-
julia = "1.3"
28+
Tables = "1.1"
29+
julia = "1.6"
3030

3131
[extras]
3232
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"

Diff for: appveyor.yml

-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
environment:
22
matrix:
3-
- julia_version: 1.3
43
- julia_version: 1.6
54
- julia_version: nightly
65

Diff for: benchmarks/benchmark.jl

+46-8
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ if render == "live"
3434
const live = true
3535
using Luxor, ProfileSVG
3636
using Mux
37+
elseif render == "webdash"
38+
const live = true
39+
using DaggerWebDash
40+
import DaggerWebDash: LinePlot, GanttPlot, GraphPlot, ProfileViewer
3741
elseif render == "offline"
3842
const live = false
3943
using Luxor, ProfileSVG
@@ -43,6 +47,9 @@ const RENDERS = Dict{Int,Dict}()
4347
const live_port = parse(Int, get(ENV, "BENCHMARK_LIVE_PORT", "8000"))
4448

4549
const graph = parse(Bool, get(ENV, "BENCHMARK_GRAPH", "0"))
50+
if render == "webdash"
51+
@warn "BENCHMARK_GRAPH=1 is not compatible with BENCHMARK_RENDER=webdash; disabling graphing"
52+
end
4653
const profile = parse(Bool, get(ENV, "BENCHMARK_PROFILE", "0"))
4754

4855
_benches = get(ENV, "BENCHMARK", "cpu,cpu+dagger")
@@ -247,20 +254,46 @@ function main()
247254
output_prefix = "result-$(np)workers-$(nt)threads-$(Dates.now())"
248255

249256
suites = Dict()
250-
graph_opts = if graph && render != ""
251-
(log_sink=Dagger.LocalEventLog(), log_file=output_prefix*".dot")
252-
elseif render != ""
253-
(log_sink=Dagger.LocalEventLog(),)
254-
else
255-
NamedTuple()
257+
opts = (;profile=profile)
258+
if render == "live"
259+
opts = merge(opts, (;log_sink=Dagger.LocalEventLog()))
260+
if graph
261+
opts = merge(opts, (;log_file=output_prefix*".dot"))
262+
end
263+
elseif render == "webdash"
264+
ml = Dagger.MultiEventLog()
265+
ml[:core] = Dagger.Events.CoreMetrics()
266+
ml[:id] = Dagger.Events.IDMetrics()
267+
ml[:timeline] = Dagger.Events.TimelineMetrics()
268+
profile && (ml[:profile] = DaggerWebDash.ProfileMetrics())
269+
ml[:wsat] = Dagger.Events.WorkerSaturation()
270+
ml[:loadavg] = Dagger.Events.CPULoadAverages()
271+
ml[:bytes] = Dagger.Events.BytesAllocd()
272+
ml[:mem] = Dagger.Events.MemoryFree()
273+
ml[:esat] = Dagger.Events.EventSaturation()
274+
ml[:psat] = Dagger.Events.ProcessorSaturation()
275+
lw = Dagger.Events.LogWindow(20*10^9, :core)
276+
d3r = DaggerWebDash.D3Renderer(live_port)
277+
push!(d3r, GanttPlot(:core, :id, :timeline, :esat, :psat, "Overview"))
278+
# TODO: push!(d3r, ProfileViewer(:core, :profile, "Profile Viewer"))
279+
push!(d3r, LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks"))
280+
push!(d3r, LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads"))
281+
push!(d3r, LinePlot(:core, :bytes, "Allocated Bytes", "Bytes"))
282+
push!(d3r, LinePlot(:core, :mem, "Available Memory", "% Free"))
283+
push!(d3r, GraphPlot(:core, :id, :timeline, :profile, "DAG"))
284+
push!(lw.creation_handlers, d3r)
285+
push!(lw.deletion_handlers, d3r)
286+
ml.aggregators[:logwindow] = lw
287+
ml.aggregators[:d3r] = d3r
288+
opts = merge(opts, (;log_sink=ml))
256289
end
257-
ctx = Context(collect((1:nw) .+ 1); profile=profile, graph_opts...)
290+
ctx = Context(collect((1:nw) .+ 1); opts...)
258291
for bench in benches
259292
name = bench.name
260293
println("creating $name benchmarks")
261294
suites[name] = nmf_suite(ctx; dagger=bench.dagger, accel=bench.accel)
262295
end
263-
if render != ""
296+
if render == "live" || render == "offline"
264297
Dagger.show_gantt(ctx; width=1800, window_length=5, delay=2, port=live_port, live=live)
265298
if live
266299
# Make sure server code is compiled
@@ -269,6 +302,11 @@ function main()
269302
run(pipeline(`curl -s localhost:$live_port/profile`; stdout=devnull))
270303
@info "Rendering started on port $live_port"
271304
end
305+
elseif render == "webdash"
306+
# Kick the webserver into gear
307+
collect(ctx, delayed(identity)(1))
308+
run(pipeline(`curl -s localhost:$live_port/index.html`; stdout=devnull))
309+
@info "Rendering started on port $live_port"
272310
end
273311
res = Dict()
274312
for bench in benches

Diff for: docs/make.jl

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ makedocs(;
1818
"Scopes" => "scopes.md",
1919
"Dynamic Scheduler Control" => "dynamic.md",
2020
"Logging and Graphing" => "logging.md",
21+
"Scheduler Visualization" => "scheduler-visualization.md",
2122
"Benchmarking" => "benchmarking.md",
2223
"Scheduler Internals" => "scheduler-internals.md",
2324
"Distributed Table" => "dtable.md",

Diff for: docs/src/logging.md

+100-7
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,99 @@
22

33
Dagger's scheduler keeps track of the important and potentially expensive
44
actions it does, such as moving data between workers or executing thunks, and
5-
tracks how much time and memory allocations these operations consume. Saving
6-
this information somewhere accessible is disabled by default, but it's quite
7-
easy to turn it on:
5+
tracks how much time and memory allocations these operations consume, among
6+
other things. Saving this information somewhere accessible is disabled by
7+
default, but it's quite easy to turn it on, by setting a "log sink" in the
8+
`Context` being used, as `ctx.log_sink`. A variety of log sinks are built-in to
9+
Dagger; the `NoOpLog` is the default log sink when one isn't explicitly
10+
specified, and disables logging entirely (to minimize overhead). There are
11+
currently two other log sinks of interest; the first and newer of the two is
12+
the `Dagger.MultiEventLog`, which generates multiple independent log streams,
13+
one per "consumer" (details in the next section). The second and older sink is
14+
the `Dagger.LocalEventLog`, which is explained later in this document. Most
15+
users are recommended to use the `MultiEventLog` since it's far more flexible
16+
and extensible, and is more performant in general.
17+
18+
## MultiEventLog
19+
20+
The `MultiEventLog` is intended to be configurable to exclude unnecessary
21+
information, and to include any built-in or user-defined metrics. It stores a
22+
set of "sub-log" streams internally, appending a single element to each of them
23+
when an event is generated. This element can be called a "sub-event" (to
24+
distinguish it from the higher-level "event" that Dagger creates), and is
25+
created by a "consumer". A consumer is a function or callable struct that, when
26+
called with the `Dagger.Event` object generated by Dagger, returns a sub-event
27+
characterizing whatever information the consumer represents. For example, the
28+
`Dagger.Events.BytesAllocd` consumer calculates the total bytes allocated and
29+
live at any given time within Dagger, and returns the current value when
30+
called. Let's construct one:
31+
32+
```julia
33+
ctx = Context()
34+
ml = Dagger.MultiEventLog()
35+
36+
# Add the BytesAllocd consumer to the log as `:bytes`
37+
ml[:bytes] = Dagger.Events.BytesAllocd()
38+
39+
ctx.log_sink = ml
40+
```
41+
42+
As we can see above, each consumer gets a unique name as a `Symbol` that
43+
identifies it. Now that the log sink is attached with a consumer, we can
44+
execute some Dagger tasks, and then collect the sub-events generated by
45+
`BytesAllocd`:
46+
47+
```julia
48+
# Using the lazy API, for explanatory purposes
49+
collect(ctx, delayed(+)(1, delayed(*)(3, 4))) # Allocates 8 bytes
50+
log = Dagger.get_logs!(ml)[1] # Get the logs for worker 1
51+
@show log[:bytes]
52+
```
53+
54+
You'll then see that 8 bytes are allocated and then freed during the process of
55+
executing and completing those tasks.
56+
57+
Note that the `MultiEventLog` can also be used perfectly well when using
58+
Dagger's eager API:
59+
60+
```julia
61+
ctx = Dagger.Sch.eager_context()
62+
ctx.log_sink = ml
63+
64+
a = Dagger.@spawn 3*4
65+
Dagger.@spawn 1+a
66+
```
67+
68+
There are a variety of other consumers built-in to Dagger, under the
69+
`Dagger.Events` module:
70+
71+
```@docs
72+
Dagger.Events.CoreMetrics
73+
Dagger.Events.IDMetrics
74+
Dagger.Events.TimelineMetrics
75+
Dagger.Events.FullMetrics
76+
Dagger.Events.BytesAllocd
77+
Dagger.Events.CPULoadAverages
78+
Dagger.Events.MemoryFree
79+
Dagger.Events.EventSaturation
80+
Dagger.Events.WorkerSaturation
81+
Dagger.Events.ProcessorSaturation
82+
```
83+
84+
The `MultiEventLog` also has a mechanism to call a set of functions, called
85+
"aggregators", after all consumers have been executed, and are passed the full
86+
set of log streams as a `Dict{Symbol,Vector{Any}}`. The only one currently
87+
shipped with Dagger directly is the `LogWindow`:
88+
89+
```@docs
90+
Dagger.Events.LogWindow
91+
```
92+
93+
## LocalEventLog
94+
95+
The `LocalEventLog` is generally only useful when you want combined events
96+
(event start and finish combined as a single unit), and only care about a few
97+
simple built-in generated events. Let's attach one to our context:
898

999
```julia
10100
ctx = Context()
@@ -13,10 +103,7 @@ ctx.log_sink = log
13103
```
14104

15105
Now anytime `ctx` is used as the context for a scheduler, the scheduler will
16-
log events into `log`. A `LocalEventLog` logs information in-memory, and does
17-
so on each worker. The default log object is a `NoOpLog`, which doesn't store
18-
events at all. The `FilterLog` exists to allow writing events to a
19-
user-defined location (such as a database, file, or network socket).
106+
log events into `log`.
20107

21108
Once sufficient data has been accumulated into a `LocalEventLog`, it can be
22109
gathered to a single host via `Dagger.get_logs!(log)`. The result is a
@@ -48,3 +135,9 @@ which aren't `Thunk`s (such as operations on the `Dagger.DArray`) will be
48135
properly rendered with input arguments (which normally aren't rendered because
49136
a `Thunk` is dynamically generated from such operations by Dagger before
50137
scheduling).
138+
139+
## FilterLog
140+
141+
The `FilterLog` exists to allow writing events to a user-defined location (such
142+
as a database, file, or network socket). It is not currently tested or
143+
documented.

Diff for: docs/src/scheduler-visualization.md

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Scheduler Visualization with DaggerWebDash
2+
3+
When working with Dagger, especially when working with its scheduler, it can be
4+
helpful to visualize what Dagger is doing internally. To assist with this, a
5+
web dashboard is available in the DaggerWebDash.jl package. This web dashboard
6+
uses a web server running within each Dagger worker, along with event logging
7+
information, to expose details about the scheduler. Information like worker and
8+
processor saturation, memory allocations, profiling traces, and much more are
9+
available in easy-to-interpret plots.
10+
11+
Using the dashboard is relatively simple and straightforward; if you run
12+
Dagger's benchmarking script, it's enabled for you automatically if the
13+
`BENCHMARK_RENDER` environment variable is set to `webdash`. This is the
14+
easiest way to get started with the web dashboard for new users.
15+
16+
For manual usage, the following snippet of code will suffice:
17+
18+
```julia
19+
ctx = Context() # or `ctx = Dagger.Sch.eager_context()` for eager API usage
20+
ml = Dagger.MultiEventLog()
21+
22+
## Add some logging events of interest
23+
24+
ml[:core] = Dagger.Events.CoreMetrics()
25+
ml[:id] = Dagger.Events.IDMetrics()
26+
ml[:timeline] = Dagger.Events.TimelineMetrics()
27+
# ...
28+
29+
# (Optional) Enable profile flamegraph generation with ProfileSVG
30+
ml[:profile] = DaggerWebDash.ProfileMetrics()
31+
ctx.profile = true
32+
33+
# Create a LogWindow; necessary for real-time event updates
34+
lw = Dagger.Events.LogWindow(20*10^9, :core)
35+
ml.aggregators[:logwindow] = lw
36+
37+
# Create the D3Renderer server on port 8080
38+
d3r = DaggerWebDash.D3Renderer(8080)
39+
40+
## Add some plots! Rendered top-down in order
41+
42+
# Show an overview of all generated events as a Gantt chart
43+
push!(d3r, GanttPlot(:core, :id, :timeline, :esat, :psat, "Overview"))
44+
45+
# Show various numerical events as line plots over time
46+
push!(d3r, LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks"))
47+
push!(d3r, LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads"))
48+
push!(d3r, LinePlot(:core, :bytes, "Allocated Bytes", "Bytes"))
49+
push!(d3r, LinePlot(:core, :mem, "Available Memory", "% Free"))
50+
51+
# Show a graph rendering of compute tasks and data movement between them
52+
# Note: Profile events are ignored if absent from the log
53+
push!(d3r, GraphPlot(:core, :id, :timeline, :profile, "DAG"))
54+
55+
# TODO: Not yet functional
56+
#push!(d3r, ProfileViewer(:core, :profile, "Profile Viewer"))
57+
58+
# Add the D3Renderer as a consumer of special events generated by LogWindow
59+
push!(lw.creation_handlers, d3r)
60+
push!(lw.deletion_handlers, d3r)
61+
62+
# D3Renderer is also an aggregator
63+
ml.aggregators[:d3r] = d3r
64+
65+
ctx.log_sink = ml
66+
# ... use `ctx`
67+
```
68+
69+
Once the server has started, you can browse to `http://localhost:8080/` (if
70+
running on your local machine) to view the plots in real time. The dashboard
71+
also provides options at the top of the page to control the drawing speed,
72+
enable and disable reading updates from the server (disabling freezes the
73+
display at the current instant), and a selector for which worker to look at. If
74+
the connection to the server is lost for any reason, the dashboard will attempt
75+
to reconnect at 5 second intervals. The dashboard can usually survive restarts
76+
of the server perfectly well, although refreshing the page is usually a good
77+
idea. Informational messages are also logged to the browser console for
78+
debugging.

Diff for: lib/DaggerWebDash/Project.toml

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
name = "DaggerWebDash"
2+
uuid = "cfc5aa84-1a2a-41ab-b391-ede92ecae40c"
3+
authors = ["Julian P Samaroo <[email protected]>"]
4+
version = "0.1.0"
5+
6+
[deps]
7+
Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
8+
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
9+
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
10+
MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
11+
Mux = "a975b10e-0019-58db-a62f-e48ff68538c9"
12+
ProfileSVG = "132c30aa-f267-4189-9183-c8a63c7e05e6"
13+
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
14+
StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
15+
16+
[compat]
17+
Dagger = "0.13.2"
18+
JSON3 = "1"
19+
MemPool = "0.3"
20+
Mux = "0.7"
21+
ProfileSVG = "0.2"
22+
StructTypes = "1"
23+
julia = "1.6"

Diff for: lib/DaggerWebDash/src/DaggerWebDash.jl

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
module DaggerWebDash
2+
3+
using Dagger
4+
5+
include("core.jl")
6+
include("profile.jl")
7+
include("d3.jl")
8+
9+
end

0 commit comments

Comments
 (0)