Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resilience improvements #246

Merged
merged 20 commits into from
Sep 7, 2018
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Adding various metrics
ewoutp committed Aug 31, 2018
commit 0cc2d5dbba9c78eff90c71af2dc4a9b730d032ac
558 changes: 558 additions & 0 deletions examples/metrics/dashboard.json

Large diffs are not rendered by default.

24 changes: 6 additions & 18 deletions examples/metrics/deployment-operator-servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -1,34 +1,22 @@
# This example shows how to integrate with the Prometheus Operator
# to bring metrics from kube-arangodb to Prometheus.

apiVersion: v1
kind: Service
metadata:
name: arango-deployment-operator
labels:
app: arango-deployment-operator
spec:
selector:
app: arango-deployment-operator
ports:
- name: metrics
port: 8528

---

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: arango-deployment-operator
namespace: monitoring
labels:
team: frontend
prometheus: kube-prometheus
spec:
selector:
matchLabels:
app: arango-deployment-operator
namespaceSelector:
matchNames:
- default
endpoints:
- port: metrics
- port: server
scheme: https
tlsConfig:
insecureSkipVerify: true

66 changes: 27 additions & 39 deletions pkg/deployment/deployment_inspector.go
Original file line number Diff line number Diff line change
@@ -27,12 +27,17 @@ import (
"time"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
"github.com/arangodb/kube-arangodb/pkg/metrics"
"github.com/arangodb/kube-arangodb/pkg/util"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
"github.com/arangodb/kube-arangodb/pkg/util/profiler"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var (
// inspectDeploymentDurationGauges holds, per deployment name, the duration (in seconds)
// of an inspectDeployment run; it is set through a deferred metrics.SetDuration call.
inspectDeploymentDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_deployment_duration", "Amount of time taken by a single inspection of a deployment (in sec)", metrics.DeploymentName)
)

// inspectDeployment inspects the entire deployment, creates
// a plan to update if needed and inspects underlying resources.
// This function should be called when:
@@ -42,13 +47,16 @@ import (
// Returns the delay until this function should be called again.
func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval {
log := d.deps.Log
start := time.Now()

nextInterval := lastInterval
hasError := false
ctx := context.Background()
deploymentName := d.apiObject.GetName()
defer metrics.SetDuration(inspectDeploymentDurationGauges.WithLabelValues(deploymentName), start)

// Check deployment still exists
updated, err := d.deps.DatabaseCRCli.DatabaseV1alpha().ArangoDeployments(d.apiObject.GetNamespace()).Get(d.apiObject.GetName(), metav1.GetOptions{})
updated, err := d.deps.DatabaseCRCli.DatabaseV1alpha().ArangoDeployments(d.apiObject.GetNamespace()).Get(deploymentName, metav1.GetOptions{})
if k8sutil.IsNotFound(err) {
// Deployment is gone
log.Info().Msg("Deployment is gone")
@@ -129,47 +137,27 @@ func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval
}

// Ensure all resources are created
{
ps := profiler.Start()
{
ps := profiler.Start()
if err := d.resources.EnsureSecrets(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Secret creation failed", err, d.apiObject))
}
ps.LogIf(log, time.Millisecond*10, "EnsureSecrets")
}
{
ps := profiler.Start()
if err := d.resources.EnsureServices(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Service creation failed", err, d.apiObject))
}
ps.LogIf(log, time.Millisecond*10, "EnsureServices")
}
if err := d.resources.EnsurePVCs(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("PVC creation failed", err, d.apiObject))
}
{
ps := profiler.Start()
if err := d.resources.EnsurePods(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject))
}
ps.LogIf(log, time.Millisecond*10, "EnsurePods")
}
ps.Done(log, "ensure resources")
if err := d.resources.EnsureSecrets(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Secret creation failed", err, d.apiObject))
}
if err := d.resources.EnsureServices(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Service creation failed", err, d.apiObject))
}
if err := d.resources.EnsurePVCs(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("PVC creation failed", err, d.apiObject))
}
if err := d.resources.EnsurePods(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject))
}

// Create access packages
{
ps := profiler.Start()
if err := d.createAccessPackages(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject))
}
ps.Done(log, "createAccessPackages")
if err := d.createAccessPackages(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject))
}

// Inspect deployment for obsolete members
28 changes: 28 additions & 0 deletions pkg/deployment/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//

package deployment

const (
// metricsComponent is the component name passed to the metrics.MustRegister*
// helpers for every metric declared in this package.
metricsComponent = "deployment"
)
6 changes: 3 additions & 3 deletions pkg/deployment/resources/deployment_health.go
Original file line number Diff line number Diff line change
@@ -31,7 +31,7 @@ import (
)

var (
fetchDeploymentHealthCounters = metrics.MustRegisterCounterVec("deployment_resources", "fetchDeploymentHealth", "Number of times the health of the deployment was fetched", "deployment", "result")
deploymentHealthFetchesCounters = metrics.MustRegisterCounterVec(metricsComponent, "deployment_health_fetches", "Number of times the health of the deployment was fetched", metrics.DeploymentName, metrics.Result)
)

// RunDeploymentHealthLoop creates a loop to fetch the health of the deployment.
@@ -48,9 +48,9 @@ func (r *Resources) RunDeploymentHealthLoop(stopCh <-chan struct{}) {
for {
if err := r.fetchDeploymentHealth(); err != nil {
log.Debug().Err(err).Msg("Failed to fetch deployment health")
fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "failed").Inc()
deploymentHealthFetchesCounters.WithLabelValues(deploymentName, metrics.Failed).Inc()
} else {
fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "success").Inc()
deploymentHealthFetchesCounters.WithLabelValues(deploymentName, metrics.Success).Inc()
}
select {
case <-time.After(time.Second * 5):
7 changes: 4 additions & 3 deletions pkg/deployment/resources/member_cleanup.go
Original file line number Diff line number Diff line change
@@ -39,19 +39,20 @@ const (
)

var (
cleanupRemovedMembersCounters = metrics.MustRegisterCounterVec("deployment_resources", "cleanupRemovedMembers", "Number of cleanup-removed-members actions", "deployment", "result")
cleanupRemovedMembersCounters = metrics.MustRegisterCounterVec(metricsComponent, "cleanup_removed_members", "Number of cleanup-removed-members actions", metrics.DeploymentName, metrics.Result)
)

// CleanupRemovedMembers removes all arangod members that are no longer part of ArangoDB deployment.
func (r *Resources) CleanupRemovedMembers() error {
// Decide what to do depending on cluster mode
switch r.context.GetSpec().GetMode() {
case api.DeploymentModeCluster:
deploymentName := r.context.GetAPIObject().GetName()
if err := r.cleanupRemovedClusterMembers(); err != nil {
cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "failed").Inc()
cleanupRemovedMembersCounters.WithLabelValues(deploymentName, metrics.Failed).Inc()
return maskAny(err)
}
cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "success").Inc()
cleanupRemovedMembersCounters.WithLabelValues(deploymentName, metrics.Success).Inc()
return nil
default:
// Other mode have no concept of cluster in which members can be removed
28 changes: 28 additions & 0 deletions pkg/deployment/resources/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//

package resources

const (
// metricsComponent is the component name passed to the metrics.MustRegister*
// helpers for every metric declared in this package.
metricsComponent = "deployment_resources"
)
10 changes: 7 additions & 3 deletions pkg/deployment/resources/pod_inspector.go
Original file line number Diff line number Diff line change
@@ -36,7 +36,8 @@ import (
)

var (
inspectedPodCounter = metrics.MustRegisterCounter("deployment", "inspected_pods", "Number of pod inspections")
inspectedPodsCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_pods", "Number of pod inspections per deployment", metrics.DeploymentName)
inspectPodsDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_pods_duration", "Amount of time taken by a single inspection of all pods for a deployment (in sec)", metrics.DeploymentName)
)

const (
@@ -50,8 +51,12 @@ const (
// Returns: Interval_till_next_inspection, error
func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
log := r.log
start := time.Now()
apiObject := r.context.GetAPIObject()
deploymentName := apiObject.GetName()
var events []*k8sutil.Event
nextInterval := maxPodInspectorInterval // Large by default, will be made smaller if needed in the rest of the function
defer metrics.SetDuration(inspectPodsDurationGauges.WithLabelValues(deploymentName), start)

pods, err := r.context.GetOwnedPods()
if err != nil {
@@ -61,7 +66,6 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {

// Update member status from all pods found
status, lastVersion := r.context.GetStatus()
apiObject := r.context.GetAPIObject()
var podNamesWithScheduleTimeout []string
var unscheduledPodNames []string
for _, p := range pods {
@@ -71,7 +75,7 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) {
}

// Pod belongs to this deployment, update metric
inspectedPodCounter.Inc()
inspectedPodsCounters.WithLabelValues(deploymentName).Inc()

// Find member status
memberStatus, group, found := status.Members.MemberStatusByPodName(p.GetName())
11 changes: 9 additions & 2 deletions pkg/deployment/resources/pvc_inspector.go
Original file line number Diff line number Diff line change
@@ -32,15 +32,22 @@ import (
)

var (
inspectedPVCCounter = metrics.MustRegisterCounter("deployment", "inspected_ppvcs", "Number of PVCs inspections")
inspectedPVCsCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_pvcs", "Number of PVC inspections per deployment", metrics.DeploymentName)
inspectPVCsDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_pvcs_duration", "Amount of time taken by a single inspection of all PVCs for a deployment (in sec)", metrics.DeploymentName)
)

const (
maxPVCInspectorInterval = util.Interval(time.Hour) // Maximum time between PVC inspection (if nothing else happens)
)

// InspectPVCs lists all PVCs that belong to the given deployment and updates
// the member status of the deployment accordingly.
func (r *Resources) InspectPVCs(ctx context.Context) (util.Interval, error) {
log := r.log
start := time.Now()
nextInterval := maxPVCInspectorInterval
deploymentName := r.context.GetAPIObject().GetName()
defer metrics.SetDuration(inspectPVCsDurationGauges.WithLabelValues(deploymentName), start)

pvcs, err := r.context.GetOwnedPVCs()
if err != nil {
@@ -52,7 +59,7 @@ func (r *Resources) InspectPVCs(ctx context.Context) (util.Interval, error) {
status, _ := r.context.GetStatus()
for _, p := range pvcs {
// PVC belongs to this deployment, update metric
inspectedPVCCounter.Inc()
inspectedPVCsCounters.WithLabelValues(deploymentName).Inc()

// Find member status
memberStatus, group, found := status.Members.MemberStatusByPVCName(p.GetName())
18 changes: 18 additions & 0 deletions pkg/deployment/resources/secrets.go
Original file line number Diff line number Diff line change
@@ -25,39 +25,57 @@ package resources
import (
"crypto/rand"
"encoding/hex"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
"github.com/arangodb/kube-arangodb/pkg/metrics"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)

var (
// inspectedSecretsCounters counts, per deployment name, the secrets handled by
// EnsureSecrets; it is incremented once before each individual secret is ensured.
inspectedSecretsCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_secrets", "Number of Secret inspections per deployment", metrics.DeploymentName)
// inspectSecretsDurationGauges holds, per deployment name, the time (in seconds)
// taken by a run of EnsureSecrets.
inspectSecretsDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_secrets_duration", "Amount of time taken by a single inspection of all Secrets for a deployment (in sec)", metrics.DeploymentName)
)

// EnsureSecrets creates all secrets needed to run the given deployment
func (r *Resources) EnsureSecrets() error {
start := time.Now()
kubecli := r.context.GetKubeCli()
ns := r.context.GetNamespace()
secrets := k8sutil.NewSecretCache(kubecli.CoreV1().Secrets(ns))
spec := r.context.GetSpec()
deploymentName := r.context.GetAPIObject().GetName()
defer metrics.SetDuration(inspectSecretsDurationGauges.WithLabelValues(deploymentName), start)
counterMetric := inspectedSecretsCounters.WithLabelValues(deploymentName)

if spec.IsAuthenticated() {
counterMetric.Inc()
if err := r.ensureTokenSecret(secrets, spec.Authentication.GetJWTSecretName()); err != nil {
return maskAny(err)
}
}
if spec.IsSecure() {
counterMetric.Inc()
if err := r.ensureTLSCACertificateSecret(secrets, spec.TLS); err != nil {
return maskAny(err)
}
}
if spec.Sync.IsEnabled() {
counterMetric.Inc()
if err := r.ensureTokenSecret(secrets, spec.Sync.Authentication.GetJWTSecretName()); err != nil {
return maskAny(err)
}
counterMetric.Inc()
if err := r.ensureTokenSecret(secrets, spec.Sync.Monitoring.GetTokenSecretName()); err != nil {
return maskAny(err)
}
counterMetric.Inc()
if err := r.ensureTLSCACertificateSecret(secrets, spec.Sync.TLS); err != nil {
return maskAny(err)
}
counterMetric.Inc()
if err := r.ensureClientAuthCACertificateSecret(secrets, spec.Sync.Authentication); err != nil {
return maskAny(err)
}
21 changes: 16 additions & 5 deletions pkg/deployment/resources/services.go
Original file line number Diff line number Diff line change
@@ -25,29 +25,38 @@ package resources
import (
"time"

"k8s.io/client-go/kubernetes"

"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
"github.com/arangodb/kube-arangodb/pkg/metrics"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
"github.com/prometheus/client_golang/prometheus"
"github.com/rs/zerolog"
)

var (
// inspectedServicesCounters counts, per deployment name, the services handled by
// EnsureServices; the counter is incremented there and is also handed on to
// ensureExternalAccessServices.
inspectedServicesCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_services", "Number of Service inspections per deployment", metrics.DeploymentName)
// inspectServicesDurationGauges holds, per deployment name, the time (in seconds)
// taken by a run of EnsureServices.
inspectServicesDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_services_duration", "Amount of time taken by a single inspection of all Services for a deployment (in sec)", metrics.DeploymentName)
)

// EnsureServices creates all services needed to service the deployment
func (r *Resources) EnsureServices() error {
log := r.log
start := time.Now()
kubecli := r.context.GetKubeCli()
apiObject := r.context.GetAPIObject()
deploymentName := apiObject.GetName()
ns := apiObject.GetNamespace()
owner := apiObject.AsOwner()
spec := r.context.GetSpec()
defer metrics.SetDuration(inspectServicesDurationGauges.WithLabelValues(deploymentName), start)
counterMetric := inspectedServicesCounters.WithLabelValues(deploymentName)

// Fetch existing services
svcs := k8sutil.NewServiceCache(kubecli.CoreV1().Services(ns))
// Headless service
counterMetric.Inc()
if _, err := svcs.Get(k8sutil.CreateHeadlessServiceName(deploymentName), metav1.GetOptions{}); err != nil {
svcName, newlyCreated, err := k8sutil.CreateHeadlessService(svcs, apiObject, owner)
if err != nil {
@@ -61,6 +70,7 @@ func (r *Resources) EnsureServices() error {

// Internal database client service
single := spec.GetMode().HasSingleServers()
counterMetric.Inc()
if _, err := svcs.Get(k8sutil.CreateDatabaseClientServiceName(deploymentName), metav1.GetOptions{}); err != nil {
svcName, newlyCreated, err := k8sutil.CreateDatabaseClientService(svcs, apiObject, single, owner)
if err != nil {
@@ -87,15 +97,16 @@ func (r *Resources) EnsureServices() error {
if single {
role = "single"
}
if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "database", k8sutil.ArangoPort, false, spec.ExternalAccess, apiObject, log, kubecli); err != nil {
if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "database", k8sutil.ArangoPort, false, spec.ExternalAccess, apiObject, log, counterMetric); err != nil {
return maskAny(err)
}

if spec.Sync.IsEnabled() {
// External (and internal) Sync master service
counterMetric.Inc()
eaServiceName := k8sutil.CreateSyncMasterClientServiceName(deploymentName)
role := "syncmaster"
if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "sync", k8sutil.ArangoSyncMasterPort, true, spec.Sync.ExternalAccess.ExternalAccessSpec, apiObject, log, kubecli); err != nil {
if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "sync", k8sutil.ArangoSyncMasterPort, true, spec.Sync.ExternalAccess.ExternalAccessSpec, apiObject, log, counterMetric); err != nil {
return maskAny(err)
}
status, lastVersion := r.context.GetStatus()
@@ -110,7 +121,7 @@ func (r *Resources) EnsureServices() error {
}

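// ensureExternalAccessServices decides whether the external access service named eaServiceName must be created or deleted (TODO confirm: the remainder of the body is not visible in this hunk).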
func (r *Resources) ensureExternalAccessServices(svcs k8sutil.ServiceInterface, eaServiceName, ns, svcRole, title string, port int, noneIsClusterIP bool, spec api.ExternalAccessSpec, apiObject k8sutil.APIObject, log zerolog.Logger, kubecli kubernetes.Interface) error {
func (r *Resources) ensureExternalAccessServices(svcs k8sutil.ServiceInterface, eaServiceName, ns, svcRole, title string, port int, noneIsClusterIP bool, spec api.ExternalAccessSpec, apiObject k8sutil.APIObject, log zerolog.Logger, counterMetric prometheus.Counter) error {
// Database external access service
createExternalAccessService := false
deleteExternalAccessService := false
21 changes: 20 additions & 1 deletion pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -22,10 +22,23 @@

package metrics

import "github.com/prometheus/client_golang/prometheus"
import (
"time"

"github.com/prometheus/client_golang/prometheus"
)

const (
// namespace is presumably the common name prefix of all metrics registered
// through this package — TODO confirm against the MustRegister* helpers.
namespace = "arangodb_operator"

// DeploymentName is the label key that carries the name of a deployment.
DeploymentName = "deployment"
// Result is the label key that carries the outcome of an action; its value
// is either Success or Failed.
Result = "result"
// Success is the Result label value for actions that succeeded.
Success = "success"
// Failed is the Result label value for actions that failed.
Failed = "failed"
)

// MustRegisterCounter creates and registers a counter.
@@ -96,3 +109,9 @@ func MustRegisterSummary(component, name, help string, objectives map[float64]fl
prometheus.MustRegister(m)
return m
}

// SetDuration stores in the given gauge the amount of time that has elapsed
// since startTime, expressed in seconds.
func SetDuration(g prometheus.Gauge, startTime time.Time) {
	elapsed := time.Since(startTime)
	g.Set(elapsed.Seconds())
}