#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash"

################################################################################
# FBGEMM_GPU Benchmark Helper Functions
################################################################################

run_tbe_microbench () {
  local env_name="$1"

  __single_run() {
    local cache_type="$1"
    local embedding_location="$2"

    echo "################################################################################"
    echo "# Running Benchmark: (${cache_type}, ${embedding_location})"
    echo "#"
    echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
    echo "################################################################################"
    echo ""

    # shellcheck disable=SC2155
    local env_prefix=$(env_name_or_prefix "${env_name}")

    # Map the benchmark-facing location names onto the values that the
    # TBE benchmark scripts expect; bail out on anything unrecognized
    # instead of passing an empty value through
    local managed
    if [ "$embedding_location" == "hbm" ]; then
      managed="device"
    elif [ "$embedding_location" == "uvm" ]; then
      managed="managed"
    else
      echo "[BENCHMARK] Unknown embedding location: ${embedding_location}"
      return 1
    fi

    # Old TBE benchmark script
    # shellcheck disable=SC2086
    print_exec conda run --no-capture-output ${env_prefix} python tbe/split_table_batched_embeddings_benchmark.py device \
      --batch-size 13107 \
      --embedding-dim 256 \
      --iters 400 \
      --warmup-runs 50 \
      --alpha 1.15 \
      --bag-size 55 \
      --weights-precision fp16 \
      --cache-precision "${cache_type}" \
      --output-dtype fp16 \
      --managed="${managed}" \
      --num-embeddings 10000000 \
      --num-tables 1 \
      --row-wise
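
    # NOTE: the new script below runs the same workload; judging from the
    # shared values, its flags map onto the old ones roughly as
    # --batch-size -> --tbe-batch-size, --bag-size -> --tbe-pooling-size,
    # --iters -> --bench-iterations, and --managed -> --emb-location.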

    # New TBE benchmark script
    #
    # Invoke `python tbe/tbe_training_benchmark.py device --help` for
    # documentation on all available flags
    # shellcheck disable=SC2086
    print_exec conda run --no-capture-output ${env_prefix} python tbe/tbe_training_benchmark.py device \
      --bench-iterations 400 \
      --bench-warmup-iterations 50 \
      --bench-num-requests 10 \
      --tbe-batch-size 13107 \
      --tbe-embedding-dim 256 \
      --tbe-pooling-size 55 \
      --tbe-num-embeddings 10000000 \
      --tbe-num-tables 1 \
      --emb-weights-dtype fp16 \
      --emb-cache-dtype "${cache_type}" \
      --emb-output-dtype fp16 \
      --emb-location "${managed}" \
      --row-wise
  }

  pushd fbgemm_gpu/bench || return 1
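
  # Sweep configurations; the commented-out entries below can be re-enabled
  # to cover additional (cache dtype, embedding location) combinations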
  local cache_types=(
    # fp16
    fp32
  )

  local embedding_locations=(
    # uvm
    hbm
  )

  for cache_type in "${cache_types[@]}"; do
    for embedding_location in "${embedding_locations[@]}"; do
      __single_run "${cache_type}" "${embedding_location}" || return 1
      echo ""
      echo ""
    done
  done

  popd || return 1
}
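
# Example usage (a sketch: assumes this file is sourced from a checkout of
# the FBGEMM repo, and `build_binary` is a hypothetical conda environment
# with fbgemm_gpu already installed):
#
#   . fbgemm_gpu_benchmarks.bash
#   run_tbe_microbench build_binary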