Feature/python test runner (#417)

dothebart · KVS85 · jsteemann · web-flow · commit ced4533b232d · 2022-08-24T10:12:51.000+02:00
* start implementing a python launch controller * make it work for the first time * try launching outside of oskar. * no more pipes needed * adjust report directory * fix paths, thread naming. * fallback if no env is configured * lint * more work on cluster etc * silence, proper error message for missing variable * convert params * lint * fix slot * fix arangosh.conf, launching of subsequent testruns * try to launch it from fish * implement 7zip * add modules to the docker container * more printing * fix handling * Add pip3 * Fix typo * Typo 2 * handle INNERWORKDIR * fix missing line break * export settings * fix typo * on windows skip !windows tests * lint, refactor, simplify * install 7z * export core directory * work on fish integration * similarize for new python job scheduler * work on report generating * try to implement timeout * also upload 7z and txt * also upload 7z and txt * fix deadline * fix workspace handling * fix temporary directory handling * make sure our temp directory exists * RTFM fail * don't put it to the workspace * implement gtest invoking * cleanup * sort, lint * prefer INNERWORKDIR * implement writing test.log * implement html report * bring back function deleted too early * install the windows boomerang handler on top level * fix include * fix reference * print before killing shit * work on timeout * finish deadline handling, rename script * fix exit code handling * lint * thanks @mpoeter for ps aid * make the thread identifier the test plus a growing number * implement central final deadline, which will kick in after 2 minutes * remove debug output * use /usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/snap/bin to locate python * wintendo next try * wintendo next try * wintendo go home * fix calculation of hard time limit * make sure nobody changes the exit code to good * add monkey patches * cleanup deadline * ignore exceptions if no process is there * deadline handling: prioritize incoming lines over timeout counting * fix 
directory handling * work on result presentation * cleanup * let the file remain open for further info * fix environment variable handling * documentation * fix port handling * work on deadline * fix hard deadline handling * make it 20s * need more time * list processes so we may guess what's actually going on * kill all, then waitpid all * make threads provide half a slot. * be sure to catch * resume just in case, then kill * resume just in case, then kill * ignore resume errors * increase volume * lint * lint * catch more * add multipliers * more load, print load avg * fix sorting by prio - biggest values first * cleanup crash report for size * if test indicates it's been crashing create report as well. * more threat to the machine. * timeout * fix typo * delete tzdata subdir first * use load and sockets for throttle control * install required python libs * only see for load [0, 1] * increase container version * announce deadline at start * don't print to logfile * give better feedback if arangosh fails to launch in first place, thanks @maierlars for bringing up the topic * Update helper.linux.fish * tschuess ruby * re-sync to be stock RTA * fix container numbers, adjust #3 * sync to rta * resync * this is not needed anymore * add --fix-missing * fresh python? * revert to tar.gz * chaos tests in nightlies demand for longer timeouts, since tests run longer. 
* Update README.md Co-authored-by: Jan <jsteemann@users.noreply.github.com> * Update README.md Co-authored-by: Jan <jsteemann@users.noreply.github.com> * Update README.md Co-authored-by: Jan <jsteemann@users.noreply.github.com> * Update README.md Co-authored-by: Jan <jsteemann@users.noreply.github.com> * Update README.md Co-authored-by: Jan <jsteemann@users.noreply.github.com> * remove more old stuff * ignore encoding errors * increase timeout to hard self kill * switch to one environment variable name * env * limit the amount of coredumps * ignore access denied to open sockets * if we need to wait for the system to cool down on start... * make sure we don't come back good if nothing launched at all * them tiny boxes need more time * need more time * add deadline status to testfailures.txt * need more time * beautify testfailures.txt * give machine estimate reasons at the start of the run * case may matter * one more environment variable Co-authored-by: Vadim <vadim@arangodb.com> Co-authored-by: Jan <jsteemann@users.noreply.github.com>
diff --git a/jenkins/helper/test_launch_controller.py b/jenkins/helper/test_launch_controller.py
@@ -75,6 +75,8 @@ def get_workspace():
     return Path.cwd() / 'work'
 
 TEMP = Path("/tmp/")
+if 'TMP' in os.environ:
+    TEMP = Path(os.environ['TMP'])
 if 'TEMP' in os.environ:
     TEMP = Path(os.environ['TEMP'])
 if 'INNERWORKDIR' in os.environ:
@@ -302,9 +304,11 @@ def __init__(self, definition_file):
         self.timeout = 1800
         if 'timeLimit'.upper() in os.environ:
             self.timeout = int(os.environ['timeLimit'.upper()])
+        elif 'timeLimit' in os.environ:
+            self.timeout = int(os.environ['timeLimit'])
         if psutil.cpu_count() <= 8:
-            print("Small machine detected, trippling deadline!")
-            self.timeout *= 3
+            print("Small machine detected, quadrupling deadline!")
+            self.timeout *= 4
         self.deadline = datetime.now() + timedelta(seconds=self.timeout)
         self.hard_deadline = datetime.now() + timedelta(seconds=self.timeout + 660)
         if definition_file.is_file():
@@ -315,7 +319,21 @@ def __init__(self, definition_file):
             for target in ['RelWithdebInfo', 'Debug']:
                 if (bin_dir / target).exists():
                     bin_dir = bin_dir / target
-
+        self.no_threads = psutil.cpu_count()
+        self.available_slots = round(self.no_threads * 2) #logical=False)
+        if IS_WINDOWS:
+            self.max_load = 0.85
+            self.max_load1 = 0.75
+        else:
+            self.max_load = self.no_threads * 0.9
+            self.max_load1 = self.no_threads * 0.9
+        # self.available_slots += (psutil.cpu_count(logical=True) - self.available_slots) / 2
+        print(f"""Machine Info:
+ - {psutil.cpu_count(logical=False)} Cores / {psutil.cpu_count(logical=True)} Threads
+ - {psutil.virtual_memory()} virtual Memory
+ - {self.max_load} / {self.max_load1} configured maximum load 0 / 1
+ - {self.available_slots} test slots
+""")
         self.cfgdir = base_source_dir / 'etc' / 'relative'
         self.bin_dir = bin_dir
         self.base_path = base_source_dir
@@ -396,15 +414,6 @@ def __init__(self, cfg):
         self.cfg = cfg
         self.deadline_reached = False
         self.slot_lock = Lock()
-        self.no_threads = psutil.cpu_count()
-        self.available_slots = round(self.no_threads * 2) #logical=False)
-        if IS_WINDOWS:
-            self.max_load = 0.85
-            self.max_load1 = 0.75
-        else:
-            self.max_load = self.no_threads * 0.9
-            self.max_load1 = self.no_threads * 0.9
-        # self.available_slots += (psutil.cpu_count(logical=True) - self.available_slots) / 2
         self.used_slots = 0
         self.scenarios = []
         self.arangosh = ArangoshExecutor(self.cfg, self.slot_lock)
@@ -429,7 +438,7 @@ def done_job(self, parallelity):
 
     def launch_next(self, offset, counter):
         """ launch one testing job """
-        if self.scenarios[offset].parallelity > (self.available_slots - self.used_slots):
+        if self.scenarios[offset].parallelity > (self.cfg.available_slots - self.used_slots):
             return False
         try:
             sock_count = get_socket_count()
@@ -439,8 +448,8 @@ def launch_next(self, offset, counter):
         except psutil.AccessDenied:
             pass
         load = psutil.getloadavg()
-        if ((load[0] > self.max_load) or
-            (load[1] > self.max_load1)):
+        if ((load[0] > self.cfg.max_load) or
+            (load[1] > self.cfg.max_load1)):
             print(F"Load to high: {str(load)} waiting before spawning more")
             return False
         with self.slot_lock:
@@ -514,6 +523,7 @@ def handle_deadline(self):
 
     def testing_runner(self):
         """ run testing suites """
+        # pylint: disable=too-many-branches
         mem = psutil.virtual_memory()
         os.environ['ARANGODB_OVERRIDE_DETECTED_TOTAL_MEMORY'] = str(int((mem.total * 0.8) / 9))
 
@@ -535,8 +545,8 @@ def testing_runner(self):
             used_slots = 0
             with self.slot_lock:
                 used_slots = self.used_slots
-            if self.available_slots > used_slots and start_offset < len(self.scenarios):
-                print(f"Launching more: {self.available_slots} > {used_slots} {counter}")
+            if self.cfg.available_slots > used_slots and start_offset < len(self.scenarios):
+                print(f"Launching more: {self.cfg.available_slots} > {used_slots} {counter}")
                 sys.stdout.flush()
                 if self.launch_next(start_offset, counter):
                     start_offset += 1
@@ -573,7 +583,9 @@ def generate_report_txt(self):
         for testrun in self.scenarios:
             print(testrun)
             if testrun.crashed or not testrun.success:
-                summary += testrun.summary
+                summary += f"\n=== {testrun.name} ===\n{testrun.summary}"
+            if testrun.finish is None:
+                summary += f"\n=== {testrun.name} ===\nhasn't been launched at all!"
         print(summary)
         (get_workspace() / 'testfailures.txt').write_text(summary)