Skip to content

Commit 021efbd

Browse files
Prometheus exporter for TF metrics.
Metrics are exported via the HTTP server on a new endpoint (defaulting to `/monitoring/prometheus/metrics`) that Prometheus can scrape. This change adds: (1) a `MonitoringConfig` proto to configure monitoring, and (2) a `--monitoring_config_file` command-line flag to pass the config. PiperOrigin-RevId: 212695523
1 parent d859d94 commit 021efbd

15 files changed

+539
-5
lines changed

tensorflow_serving/config/BUILD

+9
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,15 @@ serving_proto_library_py(
8888
],
8989
)
9090

91+
serving_proto_library(
92+
name = "monitoring_config_proto",
93+
srcs = ["monitoring_config.proto"],
94+
cc_api_version = 2,
95+
java_api_version = 2,
96+
deps = [
97+
],
98+
)
99+
91100
serving_proto_library(
92101
name = "ssl_config_proto",
93102
srcs = ["ssl_config.proto"],
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
syntax = "proto3";
2+
3+
package tensorflow.serving;
4+
option cc_enable_arenas = true;
5+
6+
// Configuration for Prometheus monitoring.
7+
message PrometheusConfig {
8+
// Whether to expose Prometheus metrics.
9+
bool enable = 1;
10+
11+
// The endpoint to expose Prometheus metrics.
12+
// If not specified, PrometheusExporter::kPrometheusPath value is used.
13+
string path = 2;
14+
}
15+
16+
// Configuration for monitoring.
17+
message MonitoringConfig {
18+
PrometheusConfig prometheus_config = 1;
19+
}

tensorflow_serving/model_servers/BUILD

+4-1
Original file line numberDiff line numberDiff line change
@@ -243,10 +243,11 @@ cc_library(
243243
deps = [
244244
":http_rest_api_handler",
245245
":server_core",
246+
"//tensorflow_serving/config:monitoring_config_proto",
247+
"//tensorflow_serving/util:prometheus_exporter",
246248
"//tensorflow_serving/util:threadpool_executor",
247249
"//tensorflow_serving/util/net_http/server/public:http_server",
248250
"//tensorflow_serving/util/net_http/server/public:http_server_api",
249-
"@com_google_absl//absl/memory",
250251
"@com_google_absl//absl/strings",
251252
"@com_googlesource_code_re2//:re2",
252253
"@org_tensorflow//tensorflow/core:lib",
@@ -318,6 +319,7 @@ cc_library(
318319
"@com_google_absl//absl/memory",
319320
"@org_tensorflow//tensorflow/core:protos_all_cc",
320321
"//tensorflow_serving/config:model_server_config_proto",
322+
"//tensorflow_serving/config:monitoring_config_proto",
321323
"//tensorflow_serving/config:ssl_config_proto",
322324
"//tensorflow_serving/core:availability_preserving_policy",
323325
"//tensorflow_serving/servables/tensorflow:session_bundle_config_proto",
@@ -366,6 +368,7 @@ py_test(
366368
"//tensorflow_serving/servables/tensorflow/testdata:half_plus_two/00000123/export.data-00000-of-00001",
367369
"//tensorflow_serving/servables/tensorflow/testdata:half_plus_two/00000123/export.index",
368370
"//tensorflow_serving/servables/tensorflow/testdata:half_plus_two/00000123/export.meta",
371+
"//tensorflow_serving/servables/tensorflow/testdata:monitoring_config.txt",
369372
"//tensorflow_serving/servables/tensorflow/testdata:saved_model_half_plus_three/00000123/assets/foo.txt",
370373
"//tensorflow_serving/servables/tensorflow/testdata:saved_model_half_plus_three/00000123/saved_model.pb",
371374
"//tensorflow_serving/servables/tensorflow/testdata:saved_model_half_plus_three/00000123/variables/variables.data-00000-of-00001",

tensorflow_serving/model_servers/http_server.cc

+48-1
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@ limitations under the License.
1414
==============================================================================*/
1515

1616
#include <cstdint>
17+
#include <memory>
1718

1819
#include "absl/strings/str_cat.h"
20+
#include "absl/strings/str_format.h"
1921
#include "absl/strings/string_view.h"
2022
#include "re2/re2.h"
2123
#include "tensorflow/core/platform/env.h"
@@ -26,6 +28,7 @@ limitations under the License.
2628
#include "tensorflow_serving/util/net_http/server/public/httpserver.h"
2729
#include "tensorflow_serving/util/net_http/server/public/response_code_enum.h"
2830
#include "tensorflow_serving/util/net_http/server/public/server_request_interface.h"
31+
#include "tensorflow_serving/util/prometheus_exporter.h"
2932
#include "tensorflow_serving/util/threadpool_executor.h"
3033

3134
namespace tensorflow {
@@ -79,6 +82,35 @@ net_http::HTTPStatusCode ToHTTPStatusCode(const Status& status) {
7982
}
8083
}
8184

85+
void ProcessPrometheusRequest(PrometheusExporter* exporter,
86+
const PrometheusConfig& prometheus_config,
87+
net_http::ServerRequestInterface* req) {
88+
std::vector<std::pair<string, string>> headers;
89+
headers.push_back({"Content-Type", "text/plain"});
90+
string output;
91+
Status status;
92+
// Check if url matches the path.
93+
if (req->uri_path() != prometheus_config.path()) {
94+
output = absl::StrFormat("Unexpected path: %s. Should be %s",
95+
req->uri_path(), prometheus_config.path());
96+
status = Status(error::Code::INVALID_ARGUMENT, output);
97+
} else {
98+
status = exporter->GeneratePage(&output);
99+
}
100+
const net_http::HTTPStatusCode http_status = ToHTTPStatusCode(status);
101+
// Note: we add headers+output for non successful status too, in case the
102+
// output contains details about the error (e.g. error messages).
103+
for (const auto& kv : headers) {
104+
req->OverwriteResponseHeader(kv.first, kv.second);
105+
}
106+
req->WriteResponseString(output);
107+
if (http_status != net_http::HTTPStatusCode::OK) {
108+
VLOG(1) << "Error Processing prometheus metrics request. Error: "
109+
<< status.ToString();
110+
}
111+
req->ReplyWithStatus(http_status);
112+
}
113+
82114
class RequestExecutor final : public net_http::EventExecutor {
83115
public:
84116
explicit RequestExecutor(int num_threads)
@@ -147,7 +179,8 @@ class RestApiRequestDispatcher {
147179
} // namespace
148180

149181
std::unique_ptr<net_http::HTTPServerInterface> CreateAndStartHttpServer(
150-
int port, int num_threads, int timeout_in_ms, ServerCore* core) {
182+
int port, int num_threads, int timeout_in_ms,
183+
const MonitoringConfig& monitoring_config, ServerCore* core) {
151184
auto options = absl::make_unique<net_http::ServerOptions>();
152185
options->AddPort(static_cast<uint32_t>(port));
153186
options->SetExecutor(absl::make_unique<RequestExecutor>(num_threads));
@@ -157,6 +190,20 @@ std::unique_ptr<net_http::HTTPServerInterface> CreateAndStartHttpServer(
157190
return nullptr;
158191
}
159192

193+
// Register handler for prometheus metric endpoint.
194+
if (monitoring_config.prometheus_config().enable()) {
195+
std::shared_ptr<PrometheusExporter> exporter =
196+
std::make_shared<PrometheusExporter>();
197+
net_http::RequestHandlerOptions prometheus_request_options;
198+
PrometheusConfig prometheus_config = monitoring_config.prometheus_config();
199+
server->RegisterRequestHandler(
200+
monitoring_config.prometheus_config().path(),
201+
[exporter, prometheus_config](net_http::ServerRequestInterface* req) {
202+
ProcessPrometheusRequest(exporter.get(), prometheus_config, req);
203+
},
204+
prometheus_request_options);
205+
}
206+
160207
std::shared_ptr<RestApiRequestDispatcher> dispatcher =
161208
std::make_shared<RestApiRequestDispatcher>(timeout_in_ms, core);
162209
net_http::RequestHandlerOptions handler_options;

tensorflow_serving/model_servers/http_server.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717

1818
#include <memory>
1919

20+
#include "tensorflow_serving/config/monitoring_config.pb.h"
2021
#include "tensorflow_serving/util/net_http/server/public/httpserver_interface.h"
2122

2223
namespace tensorflow {
@@ -30,7 +31,8 @@ class ServerCore;
3031
//
3132
// The returned server is in a state of accepting new requests.
3233
std::unique_ptr<net_http::HTTPServerInterface> CreateAndStartHttpServer(
33-
int port, int num_threads, int timeout_in_ms, ServerCore* core);
34+
int port, int num_threads, int timeout_in_ms,
35+
const MonitoringConfig& monitoring_config, ServerCore* core);
3436

3537
} // namespace serving
3638
} // namespace tensorflow

tensorflow_serving/model_servers/main.cc

+5-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,11 @@ int main(int argc, char** argv) {
135135
"Enables model warmup, which triggers lazy "
136136
"initializations (such as TF optimizations) at load "
137137
"time, to reduce first request latency."),
138-
tensorflow::Flag("version", &display_version, "Display version")};
138+
tensorflow::Flag("version", &display_version, "Display version"),
139+
tensorflow::Flag(
140+
"monitoring_config_file", &options.monitoring_config_file,
141+
"If non-empty, read an ascii MonitoringConfig protobuf from "
142+
"the supplied file name")};
139143

140144
const auto& usage = tensorflow::Flags::Usage(argv[0], flag_list);
141145
if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {

tensorflow_serving/model_servers/server.cc

+8-1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ limitations under the License.
3636
#include "tensorflow/core/platform/protobuf.h"
3737
#include "tensorflow/core/protobuf/config.pb.h"
3838
#include "tensorflow_serving/config/model_server_config.pb.h"
39+
#include "tensorflow_serving/config/monitoring_config.pb.h"
3940
#include "tensorflow_serving/config/ssl_config.pb.h"
4041
#include "tensorflow_serving/core/availability_preserving_policy.h"
4142
#include "tensorflow_serving/model_servers/grpc_status_util.h"
@@ -285,9 +286,15 @@ Status Server::BuildAndStart(const Options& server_options) {
285286
if (server_options.http_port != server_options.grpc_port) {
286287
const string server_address =
287288
"localhost:" + std::to_string(server_options.http_port);
289+
MonitoringConfig monitoring_config;
290+
if (!server_options.monitoring_config_file.empty()) {
291+
monitoring_config = ReadProtoFromFile<MonitoringConfig>(
292+
server_options.monitoring_config_file);
293+
}
288294
http_server_ = CreateAndStartHttpServer(
289295
server_options.http_port, server_options.http_num_threads,
290-
server_options.http_timeout_in_ms, server_core_.get());
296+
server_options.http_timeout_in_ms, monitoring_config,
297+
server_core_.get());
291298
if (http_server_ != nullptr) {
292299
LOG(INFO) << "Exporting HTTP/REST API at:" << server_address << " ...";
293300
} else {

tensorflow_serving/model_servers/server.h

+1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ class Server {
6565
tensorflow::string ssl_config_file;
6666
string model_config_file;
6767
bool enable_model_warmup = true;
68+
tensorflow::string monitoring_config_file;
6869

6970
Options();
7071
};

tensorflow_serving/model_servers/tensorflow_model_server_test.py

+30
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ def GetArgsKey(*args, **kwargs):
117117
def RunServer(model_name,
118118
model_path,
119119
model_config_file=None,
120+
monitoring_config_file=None,
120121
batching_parameters_file=None,
121122
grpc_channel_arguments='',
122123
wait_for_server_ready=True,
@@ -131,6 +132,7 @@ def RunServer(model_name,
131132
model_name: Name of model.
132133
model_path: Path to model.
133134
model_config_file: Path to model config file.
135+
monitoring_config_file: Path to the monitoring config file.
134136
batching_parameters_file: Path to batching parameters.
135137
grpc_channel_arguments: Custom gRPC args for server.
136138
wait_for_server_ready: Wait for gRPC port to be ready.
@@ -165,6 +167,9 @@ def RunServer(model_name,
165167
else:
166168
raise ValueError('Both model_config_file and model_path cannot be empty!')
167169

170+
if monitoring_config_file:
171+
command += ' --monitoring_config_file=' + monitoring_config_file
172+
168173
if batching_parameters_file:
169174
command += ' --enable_batching'
170175
command += ' --batching_parameters_file=' + batching_parameters_file
@@ -287,6 +292,10 @@ def _GetBatchingParametersFile(self):
287292
"""Returns a path to a batching configuration file."""
288293
return os.path.join(self.testdata_dir, 'batching_config.txt')
289294

295+
def _GetMonitoringConfigFile(self):
296+
"""Returns a path to a monitoring configuration file."""
297+
return os.path.join(self.testdata_dir, 'monitoring_config.txt')
298+
290299
def _VerifyModelSpec(self,
291300
actual_model_spec,
292301
exp_model_name,
@@ -642,6 +651,27 @@ def testGetStatusREST(self):
642651
}]
643652
})
644653

654+
def testPrometheusEndpoint(self):
655+
"""Test ModelStatus implementation over REST API with columnar inputs."""
656+
model_path = self._GetSavedModelBundlePath()
657+
host, port = TensorflowModelServerTest.RunServer(
658+
'default',
659+
model_path,
660+
monitoring_config_file=self._GetMonitoringConfigFile())[2].split(':')
661+
662+
# Prepare request
663+
url = 'http://{}:{}/monitoring/prometheus/metrics'.format(host, port)
664+
665+
# Send request
666+
resp_data = None
667+
try:
668+
resp_data = CallREST(url, None)
669+
except Exception as e: # pylint: disable=broad-except
670+
self.fail('Request failed with error: {}'.format(e))
671+
672+
# Verify that there should be some metric type information.
673+
self.assertIn('# TYPE', resp_data)
674+
645675

646676
if __name__ == '__main__':
647677
tf.test.main()

tensorflow_serving/servables/tensorflow/testdata/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,5 @@ exports_files([
8686
"good_model_config.txt",
8787
"bad_model_config.txt",
8888
"batching_config.txt",
89+
"monitoring_config.txt",
8990
])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
prometheus_config: {
2+
enable: true,
3+
path: "/monitoring/prometheus/metrics"
4+
}

tensorflow_serving/util/BUILD

+24
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,18 @@ cc_library(
6262
],
6363
)
6464

65+
cc_library(
66+
name = "prometheus_exporter",
67+
srcs = ["prometheus_exporter.cc"],
68+
hdrs = ["prometheus_exporter.h"],
69+
deps = [
70+
"@com_google_absl//absl/strings",
71+
"@com_google_absl//absl/strings:str_format",
72+
"@com_googlesource_code_re2//:re2",
73+
"@org_tensorflow//tensorflow/core:lib",
74+
],
75+
)
76+
6577
###############################################################################
6678
# Internal targets
6779
###############################################################################
@@ -87,6 +99,18 @@ cc_test(
8799
],
88100
)
89101

102+
cc_test(
103+
name = "prometheus_exporter_test",
104+
size = "small",
105+
srcs = ["prometheus_exporter_test.cc"],
106+
deps = [
107+
":prometheus_exporter",
108+
"//tensorflow_serving/core/test_util:test_main",
109+
"@com_google_absl//absl/strings",
110+
"@org_tensorflow//tensorflow/core:lib",
111+
],
112+
)
113+
90114
cc_test(
91115
name = "event_bus_test",
92116
size = "small",

0 commit comments

Comments
 (0)