Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add pod initialization logic in diag and follow up some minor reporting changes. #4690

Merged
merged 13 commits into from
Aug 28, 2020
30 changes: 22 additions & 8 deletions pkg/diag/validator/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ var (
proto.StatusCode_STATUSCHECK_CONTAINER_WAITING_UNKNOWN: {},
proto.StatusCode_STATUSCHECK_UNKNOWN_UNSCHEDULABLE: {},
proto.StatusCode_STATUSCHECK_SUCCESS: {},
proto.StatusCode_STATUSCHECK_POD_INITIALIZING: {},
}
)

Expand Down Expand Up @@ -131,7 +132,18 @@ func getPodStatus(pod *v1.Pod) (proto.StatusCode, []string, error) {
// TODO(dgageot): Add EphemeralContainerStatuses
cs := append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...)
// See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-states
return getContainerStatus(pod, cs)
statusCode, logs, err := getContainerStatus(pod, cs)
if statusCode == proto.StatusCode_STATUSCHECK_POD_INITIALIZING {
// Determine if an init container is still running and fetch the init logs.
for _, c := range pod.Status.InitContainerStatuses {
if c.State.Waiting != nil {
return statusCode, []string{}, fmt.Errorf("waiting for init container %s to start", c.Name)
} else if c.State.Running != nil {
return statusCode, getPodLogs(pod, c.Name), fmt.Errorf("waiting for init container %s to complete", c.Name)
}
}
}
return statusCode, logs, err
case v1.ConditionUnknown:
logrus.Debugf("Pod %q scheduling condition is unknown", pod.Name)
return proto.StatusCode_STATUSCHECK_UNKNOWN, nil, fmt.Errorf(c.Message)
Expand Down Expand Up @@ -288,7 +300,8 @@ func extractErrorMessageFromWaitingContainerStatus(po *v1.Pod, c v1.ContainerSta
// Extract a meaningful error out of container statuses.
switch c.State.Waiting.Reason {
case podInitializing:
// container is waiting to run
// container is waiting to run. This could be because one of the init containers is
// still not completed
return proto.StatusCode_STATUSCHECK_POD_INITIALIZING, nil, nil
case containerCreating:
return proto.StatusCode_STATUSCHECK_CONTAINER_CREATING, nil, fmt.Errorf("creating container %s", c.Name)
Expand Down Expand Up @@ -330,13 +343,14 @@ func getPodLogs(po *v1.Pod, c string) []string {
if err != nil {
return []string{fmt.Sprintf("Error retrieving logs for pod %s. Try `%s`", po.Name, strings.Join(logCommand, " "))}
}
lines := strings.Split(string(logs), "\n")
output := strings.Split(string(logs), "\n")
// remove spurious empty lines (empty string or from trailing newline)
if len(lines) > 0 && len(lines[len(lines)-1]) == 0 {
lines = lines[:len(lines)-1]
}
for i, s := range lines {
lines[i] = fmt.Sprintf("[%s %s] %s", po.Name, c, s)
lines := make([]string, 0, len(output))
for _, s := range output {
if s == "" {
continue
}
lines = append(lines, fmt.Sprintf("[%s %s] %s", po.Name, c, s))
}
return lines
}
Expand Down
39 changes: 39 additions & 0 deletions pkg/diag/validator/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,45 @@ func TestRun(t *testing.T) {
ErrCode: proto.StatusCode_STATUSCHECK_CONTAINER_RESTARTING,
}, []string{"[foo foo-container] some panic"})},
},
{
description: "pod condition with events when pod is in Initializing phase",
pods: []*v1.Pod{{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
Namespace: "test",
},
TypeMeta: metav1.TypeMeta{Kind: "Pod"},
Status: v1.PodStatus{
Phase: v1.PodPending,
Conditions: []v1.PodCondition{{
Type: v1.PodScheduled,
Status: v1.ConditionTrue,
}},
ContainerStatuses: []v1.ContainerStatus{
{
Name: "foo-container",
Image: "foo-image",
State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "PodInitializing",
Message: "waiting to initialize",
},
},
},
},
},
}},
events: []v1.Event{
{
ObjectMeta: metav1.ObjectMeta{Namespace: "test"},
Reason: "eventCode", Type: "Warning", Message: "dummy event",
},
},
expected: []Resource{NewResource("test", "Pod", "foo", "Pending",
proto.ActionableErr{
Message: "eventCode: dummy event",
ErrCode: proto.StatusCode_STATUSCHECK_UNKNOWN_EVENT,
}, nil)},
},
}

for _, test := range tests {
Expand Down
60 changes: 42 additions & 18 deletions pkg/skaffold/deploy/resource/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ const (
defaultPodCheckDeadline = 30 * time.Second
tabHeader = " -"
tab = " "
maxLogLines = 3
)

var (
Expand All @@ -59,7 +60,7 @@ type Deployment struct {
namespace string
rType string
status Status
StatusCode proto.StatusCode
statusCode proto.StatusCode
done bool
deadline time.Duration
pods map[string]validator.Resource
Expand All @@ -77,6 +78,7 @@ func (d *Deployment) UpdateStatus(ae proto.ActionableErr) {
return
}
d.status = updated
d.statusCode = updated.ActionableError().ErrCode
d.status.changed = true
if ae.ErrCode == proto.StatusCode_STATUSCHECK_SUCCESS || isErrAndNotRetryAble(ae.ErrCode) {
d.done = true
Expand Down Expand Up @@ -136,19 +138,28 @@ func (d *Deployment) Status() Status {
return d.status
}

func (d *Deployment) IsStatusCheckComplete() bool {
return d.done
func (d *Deployment) IsStatusCheckCompleteOrCancelled() bool {
return d.done || d.statusCode == proto.StatusCode_STATUSCHECK_CONTEXT_CANCELLED
}

// StatusMessage returns the actionable-error message, terminated by a newline,
// of a pod whose error code is not STATUSCHECK_SUCCESS. If no such pod exists
// (including when there are no pods), it returns the deployment's own status
// string instead.
func (d *Deployment) StatusMessage() string {
	for _, pod := range d.pods {
		ae := pod.ActionableError()
		if ae.ErrCode == proto.StatusCode_STATUSCHECK_SUCCESS {
			continue
		}
		// d.pods is a map, so when several pods are failing the one reported
		// here is whichever the iteration happens to visit first.
		return fmt.Sprintf("%s\n", ae.Message)
	}
	return d.status.String()
}

// MarkComplete sets the deployment's done flag to true.
func (d *Deployment) MarkComplete() {
d.done = true
}

// This returns a string representing deployment status along with tab header
// ReportSinceLastUpdated returns a string representing deployment status along with tab header
// e.g.
// - testNs:deployment/leeroy-app: waiting for rollout to complete. (1/2) pending
// - testNs:pod/leeroy-app-xvbg : error pulling container image
func (d *Deployment) ReportSinceLastUpdated() string {
func (d *Deployment) ReportSinceLastUpdated(isMuted bool) string {
if d.status.reported && !d.status.changed {
return ""
}
Expand All @@ -165,15 +176,24 @@ func (d *Deployment) ReportSinceLastUpdated() string {
for _, p := range d.pods {
if s := p.ActionableError().Message; s != "" {
result.WriteString(fmt.Sprintf("%s %s %s: %s\n", tab, tabHeader, p, s))
for _, l := range p.Logs() {
result.WriteString(fmt.Sprintf("%s\n", l))
// if logs are muted, write container logs to file and last 3 lines to
// result.
out, writeTrimLines, err := withLogFile(p.Name(), &result, p.Logs(), isMuted)
if err != nil {
logrus.Debugf("could not create log file %v", err)
}
trimLines := []string{}
for i, l := range p.Logs() {
formattedLine := fmt.Sprintf("%s %s > %s\n", tab, tab, strings.TrimSuffix(l, "\n"))
if isMuted && i >= len(p.Logs())-maxLogLines {
trimLines = append(trimLines, formattedLine)
}
out.Write([]byte(formattedLine))
}
writeTrimLines(trimLines)
}
}
if result.String() == "" {
return ""
}
return fmt.Sprintf("%s %s: %s%s", tabHeader, d, d.status, result.String())
return fmt.Sprintf("%s %s: %s%s", tabHeader, d, d.StatusMessage(), result.String())
}

func (d *Deployment) cleanupStatus(msg string) string {
Expand Down Expand Up @@ -253,7 +273,8 @@ func (d *Deployment) fetchPods(ctx context.Context) error {
if !found || originalPod.StatusUpdated(p) {
d.status.changed = true
switch p.ActionableError().ErrCode {
case proto.StatusCode_STATUSCHECK_CONTAINER_CREATING:
case proto.StatusCode_STATUSCHECK_CONTAINER_CREATING,
proto.StatusCode_STATUSCHECK_POD_INITIALIZING:
event.ResourceStatusCheckEventUpdated(p.String(), p.ActionableError())
default:
event.ResourceStatusCheckEventCompleted(p.String(), p.ActionableError())
Expand All @@ -265,18 +286,21 @@ func (d *Deployment) fetchPods(ctx context.Context) error {
return nil
}

// Return first pod status in error.
// TODO: should we return all distinct error codes in future?
func (d *Deployment) FirstPodErrOccurred() proto.StatusCode {
if len(d.pods) == 0 {
return d.Status().ActionableError().ErrCode
// StatusCode() returns the deployment status code if the status check is cancelled
// or if no pod data exists for this deployment.
// If pods are fetched, this function returns the error code a pod container encountered.
func (d *Deployment) StatusCode() proto.StatusCode {
// do not process pod status codes if another deployment failed
// or the user aborted the run.
if d.statusCode == proto.StatusCode_STATUSCHECK_CONTEXT_CANCELLED {
return d.statusCode
}
for _, p := range d.pods {
if s := p.ActionableError().ErrCode; s != proto.StatusCode_STATUSCHECK_SUCCESS {
return s
}
}
return proto.StatusCode_STATUSCHECK_SUCCESS
return d.statusCode
}

func (d *Deployment) WithPodStatuses(scs []proto.StatusCode) *Deployment {
Expand Down
Loading