test-cluster-master-{error,kill} fail only in weird CI environment #6193

Closed · drewfish opened this issue Apr 14, 2016 · 21 comments
Labels: cluster (Issues and PRs related to the cluster subsystem), test (Issues and PRs related to the tests)

@drewfish (Contributor) commented Apr 14, 2016

  • Version: 4.4.3
  • Platform: some RHEL 6 variant
  • Subsystem: cluster

We build 4.x for use internally in the company, using a Jenkins job which creates a docker image in which the actual build and tests are run. Very oddly, test-cluster-master-error and test-cluster-master-kill fail every build. I've not been able to reproduce this in any other environment, even running the same docker image on my macbook.

Here is the output of the two failing tests:

00:30:43.898 not ok 97 test-cluster-master-error.js
00:30:43.898 # 
00:30:43.898 # assert.js:90
00:30:43.898 #   throw new assert.AssertionError({
00:30:43.899 #   ^
00:30:43.899 # AssertionError: The workers did not die after an error in the master
00:30:43.899 #     at process.<anonymous> (/var/builds/workspace/105338-v3-component/BUILD_CONTAINER/rhel6/label/DOCKER-LOW/app_root/test/parallel/test-cluster-master-error.js:152:12)
00:30:43.899 #     at process.g (events.js:260:16)
00:30:43.900 #     at emitOne (events.js:82:20)
00:30:43.900 #     at process.emit (events.js:169:7)
00:30:43.900   ---
00:30:43.900   duration_ms: 0.710
00:30:43.900   ...
00:30:45.613 not ok 98 test-cluster-master-kill.js
00:30:45.613 # 
00:30:45.613 # assert.js:90
00:30:45.613 #   throw new assert.AssertionError({
00:30:45.614 #   ^
00:30:45.614 # AssertionError: worker was alive after master died
00:30:45.614 #     at process.<anonymous> (/var/builds/workspace/105338-v3-component/BUILD_CONTAINER/rhel6/label/DOCKER-LOW/app_root/test/parallel/test-cluster-master-kill.js:79:12)
00:30:45.614 #     at process.g (events.js:260:16)
00:30:45.614 #     at emitOne (events.js:82:20)
00:30:45.614 #     at process.emit (events.js:169:7)
00:30:45.615   ---
00:30:45.615   duration_ms: 1.712
00:30:45.615   ...

The tests were run via:

export NODE_COMMON_PORT=16733
export NODE_TEST_DIR=/tmp/screwdriver
scl enable devtoolset-4 python27 -- gmake CFLAGS=-m64 LDFLAGS=-Wl,-rpath,/home/y/lib64 test-ci
@mscdex added the cluster and test labels on Apr 14, 2016
@drewfish (Contributor, Author) commented

Given that I can only reproduce this in our CI build, I've started checking in experimental changes just to see how they fare.

Here's an experiment which just adds logging:

diff --git a/test/parallel/test-cluster-master-error.js b/test/parallel/test-cluster-master-error.js
index ae0f655..16d63c1 100644
--- a/test/parallel/test-cluster-master-error.js
+++ b/test/parallel/test-cluster-master-error.js
@@ -2,21 +2,39 @@
 var common = require('../common');
 var assert = require('assert');
 var cluster = require('cluster');
+var fs = require('fs');
+
+// really wish we had `...values` syntax
+function log(strings) {
+  var line = ['TEST-CLUSTER-MASTER-ERROR ', process.pid, ' '];
+  for (let i = 0; i < strings.length; i++) {
+    line.push(strings[i]);
+    if (i+1 !== strings.length) {
+      line.push(arguments[i+1]);
+    }
+  }
+  fs.appendFileSync(__dirname + '/../../artifacts/folta.log', line.join('') + '\n');
+}

 // Cluster setup
 if (cluster.isWorker) {
+  log`WORKER started`;
   var http = require('http');
   http.Server(function() {

-  }).listen(common.PORT, '127.0.0.1');
+  }).listen(common.PORT, '127.0.0.1', function() {
+    log`WORKER listening`;
+  });

 } else if (process.argv[2] === 'cluster') {
+  log`MASTER started`;

   var totalWorkers = 2;

   // Send PID to testcase process
   var forkNum = 0;
   cluster.on('fork', function forkEvent(worker) {
+    log`MASTER on:fork ${worker.process.pid}`;

     // Send PID
     process.send({
@@ -26,6 +44,7 @@ if (cluster.isWorker) {

     // Stop listening when done
     if (++forkNum === totalWorkers) {
+      log`MASTER on:fork ${worker.process.pid} -- got totalWorkers`;
       cluster.removeListener('fork', forkEvent);
     }
   });
@@ -33,14 +52,17 @@ if (cluster.isWorker) {
   // Throw accidently error when all workers are listening
   var listeningNum = 0;
   cluster.on('listening', function listeningEvent() {
+    log`MASTER on:listening`;

     // When all workers are listening
     if (++listeningNum === totalWorkers) {
+      log`MASTER on:listening -- got totalWorkers`;
       // Stop listening
       cluster.removeListener('listening', listeningEvent);

       // throw accidently error
       process.nextTick(function() {
+        log`MASTER on:listening -- got totalWorkers -- about to throw`;
         console.error('about to throw');
         throw new Error('accidently error');
       });
@@ -52,7 +74,10 @@ if (cluster.isWorker) {
   cluster.fork();
   cluster.fork();

+  log`MASTER done forking`;
 } else {
+  log`=================================================================`;
+  log`TEST started`;
   // This is the testcase

   var fork = require('child_process').fork;
@@ -62,8 +87,10 @@ if (cluster.isWorker) {
       //this will throw an error if the process is dead
       process.kill(pid, 0);

+      log`TEST isAlive(${pid}) true`;
       return true;
     } catch (e) {
+      log`TEST isAlive(${pid}) false -- ${e.message}`;
       return false;
     }
   };
@@ -82,12 +109,14 @@ if (cluster.isWorker) {

     // Add worker pid to list and progress tracker
     if (data.cmd === 'worker') {
+      log`TEST on:message worker ${data.workerPID}`;
       workers.push(data.workerPID);
     }
   });

   // When cluster is dead
   master.on('exit', function(code) {
+    log`TEST on:exit code=${code}`;

     // Check that the cluster died accidently
     existMaster = !!code;
@@ -101,6 +130,7 @@ if (cluster.isWorker) {
     setTimeout(checkWorkers, timeout);

     function checkWorkers() {
+      log`TEST checkWorkers`;
       // When master is dead all workers should be dead to
       var alive = false;
       workers.forEach(function(pid) {
@@ -110,15 +140,16 @@ if (cluster.isWorker) {
       });

       // If a worker was alive this did not act as expected
-      existWorker = !alive;
+      existWorker = alive;
     }
   });

   process.once('exit', function() {
+    log`TEST once:exit`;
     var m = 'The master did not die after an error was throwed';
     assert.ok(existMaster, m);
     m = 'The workers did not die after an error in the master';
-    assert.ok(existWorker, m);
+    assert.ok(!existWorker, m);
   });

 }

With that, this is what I see in the build (where the test fails):

TEST-CLUSTER-MASTER-ERROR 10757 =================================================================
TEST-CLUSTER-MASTER-ERROR 10757 TEST started
TEST-CLUSTER-MASTER-ERROR 10762 MASTER started
TEST-CLUSTER-MASTER-ERROR 10762 MASTER done forking
TEST-CLUSTER-MASTER-ERROR 10762 MASTER on:fork 10767
TEST-CLUSTER-MASTER-ERROR 10762 MASTER on:fork 10772
TEST-CLUSTER-MASTER-ERROR 10762 MASTER on:fork 10772 -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 10757 TEST on:message worker 10767
TEST-CLUSTER-MASTER-ERROR 10757 TEST on:message worker 10772
TEST-CLUSTER-MASTER-ERROR 10767 WORKER started
TEST-CLUSTER-MASTER-ERROR 10772 WORKER started
TEST-CLUSTER-MASTER-ERROR 10767 WORKER listening
TEST-CLUSTER-MASTER-ERROR 10772 WORKER listening
TEST-CLUSTER-MASTER-ERROR 10762 MASTER on:listening
TEST-CLUSTER-MASTER-ERROR 10762 MASTER on:listening
TEST-CLUSTER-MASTER-ERROR 10762 MASTER on:listening -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 10762 MASTER on:listening -- got totalWorkers -- about to throw
TEST-CLUSTER-MASTER-ERROR 10757 TEST on:exit code=1
TEST-CLUSTER-MASTER-ERROR 10757 TEST checkWorkers
TEST-CLUSTER-MASTER-ERROR 10757 TEST isAlive(10767) true
TEST-CLUSTER-MASTER-ERROR 10757 TEST isAlive(10772) true
TEST-CLUSTER-MASTER-ERROR 10757 TEST once:exit

This is what I see when I run locally (where the test passes):

TEST-CLUSTER-MASTER-ERROR 7012 =================================================================
TEST-CLUSTER-MASTER-ERROR 7012 TEST started
TEST-CLUSTER-MASTER-ERROR 7017 MASTER started
TEST-CLUSTER-MASTER-ERROR 7017 MASTER done forking
TEST-CLUSTER-MASTER-ERROR 7017 MASTER on:fork 7022
TEST-CLUSTER-MASTER-ERROR 7012 TEST on:message worker 7022
TEST-CLUSTER-MASTER-ERROR 7017 MASTER on:fork 7027
TEST-CLUSTER-MASTER-ERROR 7012 TEST on:message worker 7027
TEST-CLUSTER-MASTER-ERROR 7017 MASTER on:fork 7027 -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 7022 WORKER started
TEST-CLUSTER-MASTER-ERROR 7022 WORKER listening
TEST-CLUSTER-MASTER-ERROR 7017 MASTER on:listening
TEST-CLUSTER-MASTER-ERROR 7027 WORKER started
TEST-CLUSTER-MASTER-ERROR 7027 WORKER listening
TEST-CLUSTER-MASTER-ERROR 7017 MASTER on:listening
TEST-CLUSTER-MASTER-ERROR 7017 MASTER on:listening -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 7017 MASTER on:listening -- got totalWorkers -- about to throw
TEST-CLUSTER-MASTER-ERROR 7012 TEST on:exit code=1
TEST-CLUSTER-MASTER-ERROR 7012 TEST checkWorkers
TEST-CLUSTER-MASTER-ERROR 7012 TEST isAlive(7022) false -- kill ESRCH
TEST-CLUSTER-MASTER-ERROR 7012 TEST isAlive(7027) false -- kill ESRCH
TEST-CLUSTER-MASTER-ERROR 7012 TEST once:exit

@Trott (Member) commented Apr 14, 2016

@nodejs/testing

@drewfish (Contributor, Author) commented

Odd... I just ran the same test on my MacBook using off-the-shelf v4.4.3 and got a log of events very similar to the CI log above, except that isAlive() returned false and the test passed. I suspect I'll have to do some deeper logging, perhaps adding additional listeners in the test or adding logging in lib/cluster.js itself.

@Trott (Member) commented Apr 14, 2016

@nodejs/docker Not sure that Docker is a factor here, but not sure it isn't. Maybe it's something those folks have seen?

@drewfish (Contributor, Author) commented

Curious. I updated the logging to (a) specify which event listener is emitting the events, and (b) add listeners for a bunch more stuff.

With that, this is what I see in the build (where the test fails):

TEST-CLUSTER-MASTER-ERROR 10762 =================================================================
TEST-CLUSTER-MASTER-ERROR 10762 TEST started
TEST-CLUSTER-MASTER-ERROR 10767 MASTER started
TEST-CLUSTER-MASTER-ERROR 10767 MASTER done forking
TEST-CLUSTER-MASTER-ERROR 10767 MASTER cluster.on:fork 10772
TEST-CLUSTER-MASTER-ERROR 10767 MASTER cluster.on:fork 10777
TEST-CLUSTER-MASTER-ERROR 10767 MASTER cluster.on:fork 10777 -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 10762 TEST master.on:message worker 10772
TEST-CLUSTER-MASTER-ERROR 10762 TEST master.on:message worker 10777
TEST-CLUSTER-MASTER-ERROR 10767 MASTER worker:10772.on:online
TEST-CLUSTER-MASTER-ERROR 10767 MASTER worker:10777.on:online
TEST-CLUSTER-MASTER-ERROR 10772 WORKER started
TEST-CLUSTER-MASTER-ERROR 10777 WORKER started
TEST-CLUSTER-MASTER-ERROR 10772 WORKER listening
TEST-CLUSTER-MASTER-ERROR 10777 WORKER listening
TEST-CLUSTER-MASTER-ERROR 10767 MASTER worker:10772.on:listening
TEST-CLUSTER-MASTER-ERROR 10767 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR 10767 MASTER worker:10777.on:listening
TEST-CLUSTER-MASTER-ERROR 10767 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR 10767 MASTER cluster.on:listening -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 10767 MASTER cluster.on:listening -- got totalWorkers -- about to throw
TEST-CLUSTER-MASTER-ERROR 10762 TEST master.on:exit code=1
TEST-CLUSTER-MASTER-ERROR 10772 WORKER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR 10777 WORKER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR 10762 TEST checkWorkers
TEST-CLUSTER-MASTER-ERROR 10762 TEST isAlive(10772) true
TEST-CLUSTER-MASTER-ERROR 10762 TEST isAlive(10777) true
TEST-CLUSTER-MASTER-ERROR 10762 TEST process.once:exit

This is what I see when I run locally (where the test passes):

TEST-CLUSTER-MASTER-ERROR 22253 =================================================================
TEST-CLUSTER-MASTER-ERROR 22253 TEST started
TEST-CLUSTER-MASTER-ERROR 22258 MASTER started
TEST-CLUSTER-MASTER-ERROR 22258 MASTER done forking
TEST-CLUSTER-MASTER-ERROR 22258 MASTER cluster.on:fork 22263
TEST-CLUSTER-MASTER-ERROR 22258 MASTER cluster.on:fork 22268
TEST-CLUSTER-MASTER-ERROR 22258 MASTER cluster.on:fork 22268 -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 22253 TEST master.on:message worker 22263
TEST-CLUSTER-MASTER-ERROR 22253 TEST master.on:message worker 22268
TEST-CLUSTER-MASTER-ERROR 22258 MASTER worker:22263.on:online
TEST-CLUSTER-MASTER-ERROR 22258 MASTER worker:22268.on:online
TEST-CLUSTER-MASTER-ERROR 22263 WORKER started
TEST-CLUSTER-MASTER-ERROR 22268 WORKER started
TEST-CLUSTER-MASTER-ERROR 22263 WORKER listening
TEST-CLUSTER-MASTER-ERROR 22268 WORKER listening
TEST-CLUSTER-MASTER-ERROR 22258 MASTER worker:22263.on:listening
TEST-CLUSTER-MASTER-ERROR 22258 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR 22258 MASTER worker:22268.on:listening
TEST-CLUSTER-MASTER-ERROR 22258 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR 22258 MASTER cluster.on:listening -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 22258 MASTER cluster.on:listening -- got totalWorkers -- about to throw
TEST-CLUSTER-MASTER-ERROR 22253 TEST master.on:exit code=1
TEST-CLUSTER-MASTER-ERROR 22268 WORKER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR 22263 WORKER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR 22253 TEST checkWorkers
TEST-CLUSTER-MASTER-ERROR 22253 TEST isAlive(22263) false -- kill ESRCH
TEST-CLUSTER-MASTER-ERROR 22253 TEST isAlive(22268) false -- kill ESRCH
TEST-CLUSTER-MASTER-ERROR 22253 TEST process.once:exit

In both environments, the worker process.on('exit') events happen before checkWorkers. Oddly, isAlive() still reports true in the failing environment.

I think the next thing to check is the actual timing. Is checkWorkers running so soon after the workers' process.on('exit') that the processes haven't actually died yet? Or is something else causing the worker processes to hang around after that event?

@santigimeno (Member) commented

@drewfish I've been able to reproduce the error in test-cluster-master-error.js. Can you check whether this patch fixes the test for you?

diff --git a/test/parallel/test-cluster-master-error.js b/test/parallel/test-cluster-master-error.js
index ae0f655..82d1fc8 100644
--- a/test/parallel/test-cluster-master-error.js
+++ b/test/parallel/test-cluster-master-error.js
@@ -69,7 +69,6 @@ if (cluster.isWorker) {
   };

   var existMaster = false;
-  var existWorker = false;

   // List all workers
   var workers = [];
@@ -91,14 +90,12 @@ if (cluster.isWorker) {

     // Check that the cluster died accidently
     existMaster = !!code;
+  });

-    // Give the workers time to shut down
-    var timeout = 200;
-    if (common.isAix) {
-      // AIX needs more time due to default exit performance
-      timeout = 1000;
-    }
-    setTimeout(checkWorkers, timeout);
+  process.once('exit', function() {
+    var existWorker = false;
+    var m = 'The master did not die after an error was throwed';
+    assert.ok(existMaster, m);

     function checkWorkers() {
       // When master is dead all workers should be dead to
@@ -112,11 +109,8 @@ if (cluster.isWorker) {
       // If a worker was alive this did not act as expected
       existWorker = !alive;
     }
-  });

-  process.once('exit', function() {
-    var m = 'The master did not die after an error was throwed';
-    assert.ok(existMaster, m);
+    checkWorkers();
     m = 'The workers did not die after an error in the master';
     assert.ok(existWorker, m);
   });

@drewfish (Contributor, Author) commented

@santigimeno I tried that patch (plus the logging stuff). I'm still able to reproduce the test failure. Here's the log:

TEST-CLUSTER-MASTER-ERROR 10764 ======================================================================
TEST-CLUSTER-MASTER-ERROR 10764 TEST started
TEST-CLUSTER-MASTER-ERROR 10769 MASTER started
TEST-CLUSTER-MASTER-ERROR 10769 MASTER done forking
TEST-CLUSTER-MASTER-ERROR 10769 MASTER cluster.on:fork worker=10774
TEST-CLUSTER-MASTER-ERROR 10769 MASTER cluster.on:fork worker=10779
TEST-CLUSTER-MASTER-ERROR 10769 MASTER cluster.on:fork worker=10779 -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 10764 TEST master.on:message worker=10774
TEST-CLUSTER-MASTER-ERROR 10764 TEST master.on:message worker=10779
TEST-CLUSTER-MASTER-ERROR 10769 MASTER worker:10774.on:online
TEST-CLUSTER-MASTER-ERROR 10769 MASTER worker:10779.on:online
TEST-CLUSTER-MASTER-ERROR 10774 WORKER started
TEST-CLUSTER-MASTER-ERROR 10779 WORKER started
TEST-CLUSTER-MASTER-ERROR 10774 WORKER listening
TEST-CLUSTER-MASTER-ERROR 10779 WORKER listening
TEST-CLUSTER-MASTER-ERROR 10769 MASTER worker:10774.on:listening
TEST-CLUSTER-MASTER-ERROR 10769 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR 10769 MASTER worker:10779.on:listening
TEST-CLUSTER-MASTER-ERROR 10769 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR 10769 MASTER cluster.on:listening -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR 10769 MASTER cluster.on:listening -- got totalWorkers -- about to throw
TEST-CLUSTER-MASTER-ERROR 10764 TEST master.on:exit code=1
TEST-CLUSTER-MASTER-ERROR 10779 WORKER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR 10774 WORKER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR 10764 TEST process.once:exit code=0
TEST-CLUSTER-MASTER-ERROR 10764 TEST isAlive(10774) true
TEST-CLUSTER-MASTER-ERROR 10764 TEST isAlive(10779) true

@drewfish (Contributor, Author) commented

Ugh... no new insights, just more weirdness. I added timestamps and listeners for many events. The most interesting change is calling this function in each process:

function installHandlers(who) {
  process.on('error',                      (e) => { log`${who} process.on:error ${e}`; });
  process.on('uncaughtException',          (e) => { log`${who} process.on:uncaughtException ${e}`; process.exit(20); });
  process.on('unhandledRejection', (reason, p) => { log`${who} process.on:unhandledRejection reason=${reason}`; process.exit(21); });
  process.on('exit',                    (code) => { log`${who} process.on:exit code=${code}`; });
  process.on('SIGPIPE', () => { log`${who} process.on:SIGPIPE`; });
  process.on('SIGCHLD', () => { log`${who} process.on:SIGCHLD`; });
  process.on('SIGTTIN', () => { log`${who} process.on:SIGTTIN`; });
  process.on('SIGTTOU', () => { log`${who} process.on:SIGTTOU`; });
  process.on('SIGTERM', () => { log`${who} process.on:SIGTERM`; process.exit(22); });
  process.on('SIGINT',  () => { log`${who} process.on:SIGINT`; process.exit(23); });
  process.on('SIGHUP',  () => { log`${who} process.on:SIGHUP`; process.exit(24); });
}

With that, here's what I get in the failing environment:

TEST-CLUSTER-MASTER-ERROR      43223 ======================================================================
TEST-CLUSTER-MASTER-ERROR    2 43223 TEST started
TEST-CLUSTER-MASTER-ERROR   62 43228 MASTER started
TEST-CLUSTER-MASTER-ERROR   71 43228 MASTER done forking
TEST-CLUSTER-MASTER-ERROR   73 43228 MASTER cluster.on:fork worker=43233
TEST-CLUSTER-MASTER-ERROR   75 43228 MASTER cluster.on:fork worker=43238
TEST-CLUSTER-MASTER-ERROR   75 43228 MASTER cluster.on:fork worker=43238 -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR   75 43223 TEST master.on:message worker=43233
TEST-CLUSTER-MASTER-ERROR   76 43223 TEST master.on:message worker=43238
TEST-CLUSTER-MASTER-ERROR  105 43228 MASTER worker:43233.on:online
TEST-CLUSTER-MASTER-ERROR  124 43233 WORKER started
TEST-CLUSTER-MASTER-ERROR  128 43228 MASTER worker:43238.on:online
TEST-CLUSTER-MASTER-ERROR  144 43233 WORKER listening
TEST-CLUSTER-MASTER-ERROR  145 43228 MASTER worker:43233.on:listening
TEST-CLUSTER-MASTER-ERROR  145 43228 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR  164 43238 WORKER started
TEST-CLUSTER-MASTER-ERROR  186 43238 WORKER listening
TEST-CLUSTER-MASTER-ERROR  187 43228 MASTER worker:43238.on:listening
TEST-CLUSTER-MASTER-ERROR  187 43228 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR  187 43228 MASTER cluster.on:listening -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR  187 43228 MASTER cluster.on:listening -- got totalWorkers -- about to throw
TEST-CLUSTER-MASTER-ERROR  190 43228 MASTER process.on:uncaughtException Error: accidently error
TEST-CLUSTER-MASTER-ERROR  191 43228 MASTER process.on:exit code=20
TEST-CLUSTER-MASTER-ERROR  194 43233 MASTER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR  194 43238 MASTER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR  194 43223 TEST master.on:exit code=20
TEST-CLUSTER-MASTER-ERROR  195 43223 TEST isAlive(43228) false -- kill ESRCH
TEST-CLUSTER-MASTER-ERROR  196 43223 TEST process.on:SIGCHLD
TEST-CLUSTER-MASTER-ERROR  199 43223 TEST process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR  199 43223 TEST process.once:exit code=0
TEST-CLUSTER-MASTER-ERROR  199 43223 TEST isAlive(43233) true
TEST-CLUSTER-MASTER-ERROR  199 43223 TEST isAlive(43238) true
TEST-CLUSTER-MASTER-ERROR  200 43223 TEST process.on:uncaughtException AssertionError: The workers did not die after an error in the master

Here's what I get in the succeeding environment:

TEST-CLUSTER-MASTER-ERROR      971 ======================================================================
TEST-CLUSTER-MASTER-ERROR    2 971 TEST started
TEST-CLUSTER-MASTER-ERROR   99 976 MASTER started
TEST-CLUSTER-MASTER-ERROR  119 976 MASTER done forking
TEST-CLUSTER-MASTER-ERROR  127 976 MASTER cluster.on:fork worker=981
TEST-CLUSTER-MASTER-ERROR  129 971 TEST master.on:message worker=981
TEST-CLUSTER-MASTER-ERROR  129 976 MASTER cluster.on:fork worker=984
TEST-CLUSTER-MASTER-ERROR  130 971 TEST master.on:message worker=984
TEST-CLUSTER-MASTER-ERROR  130 976 MASTER cluster.on:fork worker=984 -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR  183 976 MASTER worker:984.on:online
TEST-CLUSTER-MASTER-ERROR  188 976 MASTER worker:981.on:online
TEST-CLUSTER-MASTER-ERROR  212 984 WORKER started
TEST-CLUSTER-MASTER-ERROR  228 981 WORKER started
TEST-CLUSTER-MASTER-ERROR  243 984 WORKER listening
TEST-CLUSTER-MASTER-ERROR  244 976 MASTER worker:984.on:listening
TEST-CLUSTER-MASTER-ERROR  245 976 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR  251 981 WORKER listening
TEST-CLUSTER-MASTER-ERROR  252 976 MASTER worker:981.on:listening
TEST-CLUSTER-MASTER-ERROR  253 976 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR  253 976 MASTER cluster.on:listening -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR  253 976 MASTER cluster.on:listening -- got totalWorkers -- about to throw
TEST-CLUSTER-MASTER-ERROR  259 976 MASTER process.on:uncaughtException Error: accidently error
TEST-CLUSTER-MASTER-ERROR  261 976 MASTER process.on:exit code=20
TEST-CLUSTER-MASTER-ERROR  265 981 MASTER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR  268 984 MASTER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR  268 971 TEST master.on:exit code=20
TEST-CLUSTER-MASTER-ERROR  269 971 TEST isAlive(976) false -- kill ESRCH
TEST-CLUSTER-MASTER-ERROR  271 971 TEST process.on:SIGCHLD
TEST-CLUSTER-MASTER-ERROR  277 971 TEST process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR  277 971 TEST process.once:exit code=0
TEST-CLUSTER-MASTER-ERROR  277 971 TEST isAlive(981) false -- kill ESRCH
TEST-CLUSTER-MASTER-ERROR  278 971 TEST isAlive(984) false -- kill ESRCH

(The multiple MASTER process.on:exit lines are confusing: I'm only registering one process.on('exit') handler in the master, yet it appears to be called three times. I'm not sure whether this behavior is related to the leaking child processes, but it probably isn't, since it's consistent across the passing and failing environments.)

@drewfish (Contributor, Author) commented

@bengl suggested I try the same test on 0.12. Here's the same test code (with new language features downgraded) run on 0.12.13:

TEST-CLUSTER-MASTER-ERROR      49075 ======================================================================
TEST-CLUSTER-MASTER-ERROR    1 49075 TEST started
TEST-CLUSTER-MASTER-ERROR   99 49076 MASTER started
TEST-CLUSTER-MASTER-ERROR  107 49076 MASTER done forking
TEST-CLUSTER-MASTER-ERROR  108 49076 MASTER cluster.on:fork worker=49077
TEST-CLUSTER-MASTER-ERROR  108 49076 MASTER cluster.on:fork worker=49078
TEST-CLUSTER-MASTER-ERROR  109 49076 MASTER cluster.on:fork worker=49078 -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR  109 49075 TEST master.on:message worker=49077
TEST-CLUSTER-MASTER-ERROR  110 49075 TEST master.on:message worker=49078
TEST-CLUSTER-MASTER-ERROR  182 49076 MASTER worker:49078.on:online
TEST-CLUSTER-MASTER-ERROR  184 49076 MASTER worker:49077.on:online
TEST-CLUSTER-MASTER-ERROR  196 49078 WORKER started
TEST-CLUSTER-MASTER-ERROR  198 49077 WORKER started
TEST-CLUSTER-MASTER-ERROR  213 49078 WORKER listening
TEST-CLUSTER-MASTER-ERROR  213 49077 WORKER listening
TEST-CLUSTER-MASTER-ERROR  213 49076 MASTER worker:49078.on:listening
TEST-CLUSTER-MASTER-ERROR  214 49076 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR  214 49076 MASTER worker:49077.on:listening
TEST-CLUSTER-MASTER-ERROR  214 49076 MASTER cluster.on:listening
TEST-CLUSTER-MASTER-ERROR  214 49076 MASTER cluster.on:listening -- got totalWorkers
TEST-CLUSTER-MASTER-ERROR  214 49076 MASTER cluster.on:listening -- got totalWorkers -- about to throw
TEST-CLUSTER-MASTER-ERROR  216 49076 MASTER process.on:uncaughtException Error: accidently error
TEST-CLUSTER-MASTER-ERROR  216 49076 MASTER process.on:exit code=20
TEST-CLUSTER-MASTER-ERROR  219 49077 MASTER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR  219 49078 MASTER process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR  219 49075 TEST master.on:exit code=20
TEST-CLUSTER-MASTER-ERROR  219 49075 TEST isAlive(49076) false -- kill ESRCH
TEST-CLUSTER-MASTER-ERROR  220 49075 TEST process.on:SIGCHLD
TEST-CLUSTER-MASTER-ERROR  223 49075 TEST process.on:exit code=0
TEST-CLUSTER-MASTER-ERROR  223 49075 TEST process.once:exit code=0
TEST-CLUSTER-MASTER-ERROR  224 49075 TEST isAlive(49077) false -- kill ESRCH
TEST-CLUSTER-MASTER-ERROR  224 49075 TEST isAlive(49078) false -- kill ESRCH

@drewfish (Contributor, Author) commented Apr 15, 2016

Hmm... worker.process.once('exit') seems to be tunneling the event to worker.emit('exit') but not worker.process.emit('exit'):
https://github.com/nodejs/node/blob/v4.x/lib/cluster.js#L347-L362
Never mind; thinking about it, it shouldn't.
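
For reference, the two 'exit' events live on distinct emitters: cluster re-emits the underlying ChildProcess 'exit' on the Worker object, and the ChildProcess emits its own 'exit' natively, so nothing needs to be tunneled back onto worker.process. A minimal sketch of the two (handler bodies are placeholders):

var cluster = require('cluster');

// Sketch: both events fire in the master when a worker dies, but on
// different objects.
cluster.on('fork', function(worker) {
  worker.on('exit', function(code, signal) {
    // Worker-level 'exit', re-emitted by cluster
  });
  worker.process.on('exit', function(code, signal) {
    // underlying ChildProcess 'exit'
  });
});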

@santigimeno (Member) commented

@drewfish I've been able to reproduce the error more or less consistently by increasing the number of workers. I think the problem might be that sometimes the workers are not dying immediately, at least not within the 200ms after the master dies (code here: https://github.com/nodejs/node/blob/v4.x/lib/cluster.js#L534-L540). Have you tried increasing the checkWorkers timeout?

@drewfish (Contributor, Author) commented

I had bumped up the timeout to 10 seconds but I was still seeing the failure.

@drewfish (Contributor, Author) commented

Yeah, I just added back a setInterval() that ran checkWorkers() every second (roughly as sketched below), and after 10 seconds all children were still alive.

I think I'm going to have to wire some logging into lib/cluster and start looking at what's going on there.
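
A sketch of that polling, reconstructed from the description above (not the exact code; `workers` is the pid array the test already collects):

// Sketch: poll the worker pids once a second instead of checking once.
// process.kill(pid, 0) sends no signal; it throws if the pid no longer
// exists, so it doubles as a liveness probe.
var pollInterval = setInterval(function() {
  var alive = workers.filter(function(pid) {
    try {
      process.kill(pid, 0);
      return true;
    } catch (e) {
      return false;
    }
  });
  console.error('workers still alive:', alive);
  if (alive.length === 0) clearInterval(pollInterval);
}, 1000);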

@drewfish (Contributor, Author) commented

Rats. I just ran a build on 6.0.0 (without any custom modifications) and saw the same failures:

00:31:13.921 not ok 114 test-cluster-master-error.js
00:31:13.921 # 
00:31:13.921 # assert.js:90
00:31:13.922 #   throw new assert.AssertionError({
00:31:13.922 #   ^
00:31:13.922 # AssertionError: The workers did not die after an error in the master
00:31:13.922 #     at process.<anonymous> (/var/builds/workspace/111275-v3-component/BUILD_CONTAINER/rhel6/label/DOCKER-LOW/app_root/node.git/test/parallel/test-cluster-master-error.js:121:12)
00:31:13.922 #     at process.g (events.js:286:16)
00:31:13.922 #     at emitOne (events.js:101:20)
00:31:13.923 #     at process.emit (events.js:188:7)
00:31:13.923   ---
00:31:13.923   duration_ms: 0.610
00:31:13.923   ...
00:31:15.536 not ok 115 test-cluster-master-kill.js
00:31:15.536 # 
00:31:15.536 # assert.js:90
00:31:15.537 #   throw new assert.AssertionError({
00:31:15.537 #   ^
00:31:15.537 # AssertionError: worker was alive after master died
00:31:15.537 #     at process.<anonymous> (/var/builds/workspace/111275-v3-component/BUILD_CONTAINER/rhel6/label/DOCKER-LOW/app_root/node.git/test/parallel/test-cluster-master-kill.js:79:12)
00:31:15.537 #     at process.g (events.js:286:16)
00:31:15.537 #     at emitOne (events.js:101:20)
00:31:15.537 #     at process.emit (events.js:188:7)
00:31:15.538   ---
00:31:15.538   duration_ms: 1.613
00:31:15.598   ...

@ncopa (Contributor) commented May 4, 2016

You mentioned it happens in Docker?

If the master dies before the workers, the workers become orphans, and it becomes the job of init (PID 1) to reap the exited worker processes (that is what takes the 200ms). If you run it in Docker, PID 1 is not an init designed to clean up orphans, so the workers end up as zombies.

The point is: something needs to reap the workers, using wait(2) or waitpid(2). Ideally, the master process would stay alive until all workers have exited and only then exit itself, but to my understanding there is no API for that in node.

Try running tini as the first process in your container and see if that helps.

EDIT: fix spelling of tini
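
If that's what is happening, it would also explain the isAlive() results above: process.kill(pid, 0) succeeds for a zombie, because the pid exists until the process is reaped. On Linux you can distinguish the two states by reading /proc/<pid>/stat; a minimal sketch (assumes the usual stat layout):

var fs = require('fs');

// Sketch: return the process state letter ('Z' = zombie, i.e. exited
// but not yet reaped), or null if the pid is fully gone.
function processState(pid) {
  try {
    var stat = fs.readFileSync('/proc/' + pid + '/stat', 'utf8');
    // Layout: "<pid> (<comm>) <state> <ppid> ...". The state is the
    // first field after the parenthesized command name.
    return stat.slice(stat.lastIndexOf(')') + 2).split(' ')[0];
  } catch (e) {
    return null;
  }
}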

@stefanmb (Contributor) commented May 4, 2016

@ncopa Thanks for pointing out the init issue! I wasn't aware of this limitation in Docker, but it seems there has already been some discussion on this topic in nodejs/build-containers#19. @drewfish, maybe the workaround mentioned there (the "my_init" script) can resolve your issue.

@drewfish (Contributor, Author) commented May 5, 2016

Yeah, this smells about right, thanks @ncopa! I'll give it a try (either tini or some other reaper).
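
For example, with tini as PID 1 the container setup would look something like this (a sketch; the base image, tini version, and command are placeholders for the actual build setup):

# Sketch: make tini PID 1 so orphaned processes get reaped.
FROM some-rhel6-build-image
ADD https://github.com/krallin/tini/releases/download/v0.9.0/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]
CMD ["gmake", "test-ci"]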

@Trott (Member) commented May 11, 2016

@drewfish Is this issue resolved? Or not exactly?

@cjihrig (Contributor) commented Jun 1, 2016

ping @drewfish. Can this be closed?

@bengl (Member) commented Jun 1, 2016

I built upon the work of @drewfish internally and found that, indeed, it's a custom init of ours that isn't correctly reaping workers.

Thanks @ncopa for pointing us in that direction! Closing this now, as we'll resolve it internally.

@bengl closed this as completed on Jun 1, 2016
@cjihrig (Contributor) commented Jun 2, 2016

Awesome. Thanks for following up, @bengl.
