Skip to content

Commit 021efbd

Browse files
Prometheus exporter for TF metrics.
Metrics are exported via the HTTP server on a new endpoint (defaulting to `/monitoring/prometheus/metrics`) that Prometheus can scrape. This change adds: (1) a `MonitoringConfig` proto to configure monitoring, and (2) a `--monitoring_config_file` command-line flag to pass the config. PiperOrigin-RevId: 212695523
1 parent d859d94 commit 021efbd

15 files changed

+539
-5
lines changed

tensorflow_serving/config/BUILD

+9
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,15 @@ serving_proto_library_py(
8888
],
8989
)
9090

91+
serving_proto_library(
92+
name = "monitoring_config_proto",
93+
srcs = ["monitoring_config.proto"],
94+
cc_api_version = 2,
95+
java_api_version = 2,
96+
deps = [
97+
],
98+
)
99+
91100
serving_proto_library(
92101
name = "ssl_config_proto",
93102
srcs = ["ssl_config.proto"],
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
syntax = "proto3";
2+
3+
package tensorflow.serving;
4+
option cc_enable_arenas = true;
5+
6+
// Configuration for Prometheus monitoring.
7+
message PrometheusConfig {
8+
// Whether to expose Prometheus metrics.
9+
bool enable = 1;
10+
11+
// The endpoint to expose Prometheus metrics.
12+
// If not specified, PrometheusExporter::kPrometheusPath value is used.
13+
string path = 2;
14+
}
15+
16+
// Configuration for monitoring.
17+
message MonitoringConfig {
18+
PrometheusConfig prometheus_config = 1;
19+
}

tensorflow_serving/model_servers/BUILD

+4-1
Original file line numberDiff line numberDiff line change
@@ -243,10 +243,11 @@ cc_library(
243243
deps = [
244244
":http_rest_api_handler",
245245
":server_core",
246+
"//tensorflow_serving/config:monitoring_config_proto",
247+
"//tensorflow_serving/util:prometheus_exporter",
246248
"//tensorflow_serving/util:threadpool_executor",
247249
"//tensorflow_serving/util/net_http/server/public:http_server",
248250
"//tensorflow_serving/util/net_http/server/public:http_server_api",
249-
"@com_google_absl//absl/memory",
250251
"@com_google_absl//absl/strings",
251252
"@com_googlesource_code_re2//:re2",
252253
"@org_tensorflow//tensorflow/core:lib",
@@ -318,6 +319,7 @@ cc_library(
318319
"@com_google_absl//absl/memory",
319320
"@org_tensorflow//tensorflow/core:protos_all_cc",
320321
"//tensorflow_serving/config:model_server_config_proto",
322+
"//tensorflow_serving/config:monitoring_config_proto",
321323
"//tensorflow_serving/config:ssl_config_proto",
322324
"//tensorflow_serving/core:availability_preserving_policy",
323325
"//tensorflow_serving/servables/tensorflow:session_bundle_config_proto",
@@ -366,6 +368,7 @@ py_test(
366368
"//tensorflow_serving/servables/tensorflow/testdata:half_plus_two/00000123/export.data-00000-of-00001",
367369
"//tensorflow_serving/servables/tensorflow/testdata:half_plus_two/00000123/export.index",
368370
"//tensorflow_serving/servables/tensorflow/testdata:half_plus_two/00000123/export.meta",
371+
"//tensorflow_serving/servables/tensorflow/testdata:monitoring_config.txt",
369372
"//tensorflow_serving/servables/tensorflow/testdata:saved_model_half_plus_three/00000123/assets/foo.txt",
370373
"//tensorflow_serving/servables/tensorflow/testdata:saved_model_half_plus_three/00000123/saved_model.pb",
371374
"//tensorflow_serving/servables/tensorflow/testdata:saved_model_half_plus_three/00000123/variables/variables.data-00000-of-00001",

tensorflow_serving/model_servers/http_server.cc

+48-1
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@ limitations under the License.
1414
==============================================================================*/
1515

1616
#include <cstdint>
17+
#include <memory>
1718

1819
#include "absl/strings/str_cat.h"
20+
#include "absl/strings/str_format.h"
1921
#include "absl/strings/string_view.h"
2022
#include "re2/re2.h"
2123
#include "tensorflow/core/platform/env.h"
@@ -26,6 +28,7 @@ limitations under the License.
2628
#include "tensorflow_serving/util/net_http/server/public/httpserver.h"
2729
#include "tensorflow_serving/util/net_http/server/public/response_code_enum.h"
2830
#include "tensorflow_serving/util/net_http/server/public/server_request_interface.h"
31+
#include "tensorflow_serving/util/prometheus_exporter.h"
2932
#include "tensorflow_serving/util/threadpool_executor.h"
3033

3134
namespace tensorflow {
@@ -79,6 +82,35 @@ net_http::HTTPStatusCode ToHTTPStatusCode(const Status& status) {
7982
}
8083
}
8184

85+
void ProcessPrometheusRequest(PrometheusExporter* exporter,
86+
const PrometheusConfig& prometheus_config,
87+
net_http::ServerRequestInterface* req) {
88+
std::vector<std::pair<string, string>> headers;
89+
headers.push_back({"Content-Type", "text/plain"});
90+
string output;
91+
Status status;
92+
// Check if url matches the path.
93+
if (req->uri_path() != prometheus_config.path()) {
94+
output = absl::StrFormat("Unexpected path: %s. Should be %s",
95+
req->uri_path(), prometheus_config.path());
96+
status = Status(error::Code::INVALID_ARGUMENT, output);
97+
} else {
98+
status = exporter->GeneratePage(&output);
99+
}
100+
const net_http::HTTPStatusCode http_status = ToHTTPStatusCode(status);
101+
// Note: we add headers+output for non successful status too, in case the
102+
// output contains details about the error (e.g. error messages).
103+
for (const auto& kv : headers) {
104+
req->OverwriteResponseHeader(kv.first, kv.second);
105+
}
106+
req->WriteResponseString(output);
107+
if (http_status != net_http::HTTPStatusCode::OK) {
108+
VLOG(1) << "Error Processing prometheus metrics request. Error: "
109+
<< status.ToString();
110+
}
111+
req->ReplyWithStatus(http_status);
112+
}
113+
82114
class RequestExecutor final : public net_http::EventExecutor {
83115
public:
84116
explicit RequestExecutor(int num_threads)
@@ -147,7 +179,8 @@ class RestApiRequestDispatcher {
147179
} // namespace
148180

149181
std::unique_ptr<net_http::HTTPServerInterface> CreateAndStartHttpServer(
150-
int port, int num_threads, int timeout_in_ms, ServerCore* core) {
182+
int port, int num_threads, int timeout_in_ms,
183+
const MonitoringConfig& monitoring_config, ServerCore* core) {
151184
auto options = absl::make_unique<net_http::ServerOptions>();
152185
options->AddPort(static_cast<uint32_t>(port));
153186
options->SetExecutor(absl::make_unique<RequestExecutor>(num_threads));
@@ -157,6 +190,20 @@ std::unique_ptr<net_http::HTTPServerInterface> CreateAndStartHttpServer(
157190
return nullptr;
158191
}
159192

193+
// Register handler for prometheus metric endpoint.
194+
if (monitoring_config.prometheus_config().enable()) {
195+
std::shared_ptr<PrometheusExporter> exporter =
196+
std::make_shared<PrometheusExporter>();
197+
net_http::RequestHandlerOptions prometheus_request_options;
198+
PrometheusConfig prometheus_config = monitoring_config.prometheus_config();
199+
server->RegisterRequestHandler(
200+
monitoring_config.prometheus_config().path(),
201+
[exporter, prometheus_config](net_http::ServerRequestInterface* req) {
202+
ProcessPrometheusRequest(exporter.get(), prometheus_config, req);
203+
},
204+
prometheus_request_options);
205+
}
206+
160207
std::shared_ptr<RestApiRequestDispatcher> dispatcher =
161208
std::make_shared<RestApiRequestDispatcher>(timeout_in_ms, core);
162209
net_http::RequestHandlerOptions handler_options;

tensorflow_serving/model_servers/http_server.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717

1818
#include <memory>
1919

20+
#include "tensorflow_serving/config/monitoring_config.pb.h"
2021
#include "tensorflow_serving/util/net_http/server/public/httpserver_interface.h"
2122

2223
namespace tensorflow {
@@ -30,7 +31,8 @@ class ServerCore;
3031
//
3132
// The returned server is in a state of accepting new requests.
3233
std::unique_ptr<net_http::HTTPServerInterface> CreateAndStartHttpServer(
33-
int port, int num_threads, int timeout_in_ms, ServerCore* core);
34+
int port, int num_threads, int timeout_in_ms,
35+
const MonitoringConfig& monitoring_config, ServerCore* core);
3436

3537
} // namespace serving
3638
} // namespace tensorflow

tensorflow_serving/model_servers/main.cc

+5-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,11 @@ int main(int argc, char** argv) {
135135
"Enables model warmup, which triggers lazy "
136136
"initializations (such as TF optimizations) at load "
137137
"time, to reduce first request latency."),
138-
tensorflow::Flag("version", &display_version, "Display version")};
138+
tensorflow::Flag("version", &display_version, "Display version"),
139+
tensorflow::Flag(
140+
"monitoring_config_file", &options.monitoring_config_file,
141+
"If non-empty, read an ascii MonitoringConfig protobuf from "
142+
"the supplied file name")};
139143

140144
const auto& usage = tensorflow::Flags::Usage(argv[0], flag_list);
141145
if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {

tensorflow_serving/model_servers/server.cc

+8-1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ limitations under the License.
3636
#include "tensorflow/core/platform/protobuf.h"
3737
#include "tensorflow/core/protobuf/config.pb.h"
3838
#include "tensorflow_serving/config/model_server_config.pb.h"
39+
#include "tensorflow_serving/config/monitoring_config.pb.h"
3940
#include "tensorflow_serving/config/ssl_config.pb.h"
4041
#include "tensorflow_serving/core/availability_preserving_policy.h"
4142
#include "tensorflow_serving/model_servers/grpc_status_util.h"
@@ -285,9 +286,15 @@ Status Server::BuildAndStart(const Options& server_options) {
285286
if (server_options.http_port != server_options.grpc_port) {
286287
const string server_address =
287288
"localhost:" + std::to_string(server_options.http_port);
289+
MonitoringConfig monitoring_config;
290+
if (!server_options.monitoring_config_file.empty()) {
291+
monitoring_config = ReadProtoFromFile<MonitoringConfig>(
292+
server_options.monitoring_config_file);
293+
}
288294
http_server_ = CreateAndStartHttpServer(
289295
server_options.http_port, server_options.http_num_threads,
290-
server_options.http_timeout_in_ms, server_core_.get());
296+
server_options.http_timeout_in_ms, monitoring_config,
297+
server_core_.get());
291298
if (http_server_ != nullptr) {
292299
LOG(INFO) << "Exporting HTTP/REST API at:" << server_address << " ...";
293300
} else {

tensorflow_serving/model_servers/server.h

+1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ class Server {
6565
tensorflow::string ssl_config_file;
6666
string model_config_file;
6767
bool enable_model_warmup = true;
68+
tensorflow::string monitoring_config_file;
6869

6970
Options();
7071
};

tensorflow_serving/model_servers/tensorflow_model_server_test.py

+30
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ def GetArgsKey(*args, **kwargs):
117117
def RunServer(model_name,
118118
model_path,
119119
model_config_file=None,
120+
monitoring_config_file=None,
120121
batching_parameters_file=None,
121122
grpc_channel_arguments='',
122123
wait_for_server_ready=True,
@@ -131,6 +132,7 @@ def RunServer(model_name,
131132
model_name: Name of model.
132133
model_path: Path to model.
133134
model_config_file: Path to model config file.
135+
monitoring_config_file: Path to the monitoring config file.
134136
batching_parameters_file: Path to batching parameters.
135137
grpc_channel_arguments: Custom gRPC args for server.
136138
wait_for_server_ready: Wait for gRPC port to be ready.
@@ -165,6 +167,9 @@ def RunServer(model_name,
165167
else:
166168
raise ValueError('Both model_config_file and model_path cannot be empty!')
167169

170+
if monitoring_config_file:
171+
command += ' --monitoring_config_file=' + monitoring_config_file
172+
168173
if batching_parameters_file:
169174
command += ' --enable_batching'
170175
command += ' --batching_parameters_file=' + batching_parameters_file
@@ -287,6 +292,10 @@ def _GetBatchingParametersFile(self):
287292
"""Returns a path to a batching configuration file."""
288293
return os.path.join(self.testdata_dir, 'batching_config.txt')
289294

295+
def _GetMonitoringConfigFile(self):
296+
"""Returns a path to a monitoring configuration file."""
297+
return os.path.join(self.testdata_dir, 'monitoring_config.txt')
298+
290299
def _VerifyModelSpec(self,
291300
actual_model_spec,
292301
exp_model_name,
@@ -642,6 +651,27 @@ def testGetStatusREST(self):
642651
}]
643652
})
644653

654+
def testPrometheusEndpoint(self):
655+
"""Test ModelStatus implementation over REST API with columnar inputs."""
656+
model_path = self._GetSavedModelBundlePath()
657+
host, port = TensorflowModelServerTest.RunServer(
658+
'default',
659+
model_path,
660+
monitoring_config_file=self._GetMonitoringConfigFile())[2].split(':')
661+
662+
# Prepare request
663+
url = 'http://{}:{}/monitoring/prometheus/metrics'.format(host, port)
664+
665+
# Send request
666+
resp_data = None
667+
try:
668+
resp_data = CallREST(url, None)
669+
except Exception as e: # pylint: disable=broad-except
670+
self.fail('Request failed with error: {}'.format(e))
671+
672+
# Verify that there should be some metric type information.
673+
self.assertIn('# TYPE', resp_data)
674+
645675

646676
if __name__ == '__main__':
647677
tf.test.main()

tensorflow_serving/servables/tensorflow/testdata/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,5 @@ exports_files([
8686
"good_model_config.txt",
8787
"bad_model_config.txt",
8888
"batching_config.txt",
89+
"monitoring_config.txt",
8990
])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
prometheus_config: {
2+
enable: true,
3+
path: "/monitoring/prometheus/metrics"
4+
}

tensorflow_serving/util/BUILD

+24
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,18 @@ cc_library(
6262
],
6363
)
6464

65+
cc_library(
66+
name = "prometheus_exporter",
67+
srcs = ["prometheus_exporter.cc"],
68+
hdrs = ["prometheus_exporter.h"],
69+
deps = [
70+
"@com_google_absl//absl/strings",
71+
"@com_google_absl//absl/strings:str_format",
72+
"@com_googlesource_code_re2//:re2",
73+
"@org_tensorflow//tensorflow/core:lib",
74+
],
75+
)
76+
6577
###############################################################################
6678
# Internal targets
6779
###############################################################################
@@ -87,6 +99,18 @@ cc_test(
8799
],
88100
)
89101

102+
cc_test(
103+
name = "prometheus_exporter_test",
104+
size = "small",
105+
srcs = ["prometheus_exporter_test.cc"],
106+
deps = [
107+
":prometheus_exporter",
108+
"//tensorflow_serving/core/test_util:test_main",
109+
"@com_google_absl//absl/strings",
110+
"@org_tensorflow//tensorflow/core:lib",
111+
],
112+
)
113+
90114
cc_test(
91115
name = "event_bus_test",
92116
size = "small",

0 commit comments

Comments
 (0)