Skip to content

Commit dc5b6be

Browse files
dothebart, KVS85, jsteemann, Markus Pfeiffer
authored
Feature/python test runner (#419)
* start implementing a python launch controller * make it work for the first time * try launching outside of oskar. * no more pipes needed * adjust report directory * fix paths, thread naming. * fallback if no env is configured * lint * more work on cluster etc * silence, proper error message for missing variable * convert params * lint * fix slot * fix arangosh.conf, launching of subsequent testruns * try to launch it from fish * implement 7zip * add modules to the docker container * more printing * fix handling * Add pip3 * Fix typo * Typo 2 * handle INNERWORKDIR * fix missing line break * export settings * fix typo * on windows skip !windows tests * lint, refactor, simplify * install 7z * export core directory * work on fish integration * similarize for new python job scheduler * work on report generating * try to implement timeout * also upload 7z and txt * also upload 7z and txt * fix deadline * fix workspace handling * fix temporary directory handling * make sure our temp directory exists * RTFM fail * don't put it to the workspace * implement gtest invoking * cleanup * sort, lint * prefer INNERWORKDIR * implement writing test.log * implement html report * bring back function deleted too early * install the windows boomerang handler on top level * fix include * fix reference * print before killing shit * work on timeout * finish deadline handling, rename script * fix exit code handling * lint * thanks @mpoeter for ps aid * make the thread identifier the test plus a growing number * implement central final deadline, which will kick in after 2 minutes * remove debug output * use /usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/snap/bin to locate python * wintendo next try * wintendo next try * wintendo go home * fix calculation of hard time limit * make sure nobody changes the exit code to good * add monkey patches * cleanup deadline * ignore exceptions if no process is there * deadline handling: prioritize incoming lines over timeout counting * fix 
directory handling * work on result presentation * cleanup * let the file remain open for further info * fix environment variable handling * documentation * fix port handling * work on deadline * fix hard deadline handling * make it 20s * need more time * list processes so we may guess what's actually going on * kill all, then waitpid all * make threads provide half a slot. * be sure to catch * resume just in case, then kill * resume just in case, then kill * ignore resume errors * increase volume * lint * lint * catch more * add multipliers * more load, print load avg * fix sorting by prio - biggest values first * cleanup crash report for size * if test indicates it's been crashing create report as well. * more threat to the machine. * timeout * fix typo * delete tzdata subdir first * use load and sockets for throttle control * install required python libs * only see for load [0, 1] * increase container version * announce deadline at start * don't print to logfile * give better feedback if arangosh fails to launch in first place, thanks @maierlars for bringing up the topic * Update helper.linux.fish * tschuess ruby * re-sync to be stock RTA * fix container numbers, adjust #3 * sync to rta * resync * this is not needed anymore * add --fix-missing * fresh python? * revert to tar.gz * chaos tests in nightlies demand for longer timeouts, since tests run longer. * Update README.md Co-authored-by: Jan <[email protected]> * Update README.md Co-authored-by: Jan <[email protected]> * Update README.md Co-authored-by: Jan <[email protected]> * Update README.md Co-authored-by: Jan <[email protected]> * Update README.md Co-authored-by: Jan <[email protected]> * remove more old stuff * ignore encoding errors * increase timeout to hard self kill * switch to one environment variable name * env * limit the amount of coredumps * ignore access denied to open sockets * if we need to wait for the system to cool down on start... 
* make sure we don't come back good if nothing launched at all * them tiny boxes need more time * need more time * add deadline status to testfailures.txt * need more time * beautify testfailures.txt * give machine estimate reasons at the start of the run * case may matter * one more environment variable * announce test directory * switch sequence, print first * one more var exported * add disk i/o to the output * better work with M1 performance cores * print other sequence; enable more load[1] * more threads doesn't cut it * print platform * precise M1 detection * two places on mac to collect cores * properly append * fix default directory * use iso-ish datetime format for filenames Co-authored-by: Vadim <[email protected]> Co-authored-by: Jan <[email protected]> Co-authored-by: Markus Pfeiffer <[email protected]>
1 parent 858858c commit dc5b6be

File tree

1 file changed

+41
-21
lines changed

1 file changed

+41
-21
lines changed

jenkins/helper/test_launch_controller.py

+41-21
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
""" read test definition, and generate the output for the specified target """
33
import argparse
44
from datetime import datetime, timedelta
5+
import platform
56
import os
67
from pathlib import Path
7-
import platform
88
import pprint
99
import signal
1010
import sys
@@ -37,6 +37,14 @@
3737

3838
IS_WINDOWS = platform.win32_ver()[0] != ""
3939
IS_MAC = platform.mac_ver()[0] != ""
40+
if IS_MAC:
41+
# Put us to the performance cores:
42+
# https://apple.stackexchange.com/questions/443713
43+
from os import setpriority
44+
PRIO_DARWIN_THREAD = 0b0011
45+
PRIO_DARWIN_PROCESS = 0b0100
46+
PRIO_DARWIN_BG = 0x1000
47+
setpriority(PRIO_DARWIN_PROCESS, 0, 0)
4048

4149
pp = pprint.PrettyPrinter(indent=4)
4250

@@ -309,9 +317,22 @@ def __init__(self, definition_file):
309317
self.timeout = int(os.environ['timeLimit'.upper()])
310318
elif 'timeLimit' in os.environ:
311319
self.timeout = int(os.environ['timeLimit'])
312-
if psutil.cpu_count() <= 8:
320+
321+
if psutil.cpu_count(logical=False) <= 8:
313322
print("Small machine detected, quadrupling deadline!")
314323
self.timeout *= 4
324+
self.no_threads = psutil.cpu_count()
325+
self.available_slots = round(self.no_threads * 2) #logical=False)
326+
if IS_MAC and platform.processor() == "arm" and psutil.cpu_count() == 8:
327+
self.no_threads = 6 # M1 only has 4 performance cores
328+
self.available_slots = 10
329+
if IS_WINDOWS:
330+
self.max_load = 0.85
331+
self.max_load1 = 0.75
332+
else:
333+
self.max_load = self.no_threads * 0.9
334+
self.max_load1 = self.no_threads * 0.95
335+
315336
self.deadline = datetime.now() + timedelta(seconds=self.timeout)
316337
self.hard_deadline = datetime.now() + timedelta(seconds=self.timeout + 660)
317338
if definition_file.is_file():
@@ -322,21 +343,14 @@ def __init__(self, definition_file):
322343
for target in ['RelWithdebInfo', 'Debug']:
323344
if (bin_dir / target).exists():
324345
bin_dir = bin_dir / target
325-
self.no_threads = psutil.cpu_count()
326-
self.available_slots = round(self.no_threads * 2) #logical=False)
327-
if IS_WINDOWS:
328-
self.max_load = 0.85
329-
self.max_load1 = 0.75
330-
else:
331-
self.max_load = self.no_threads * 0.9
332-
self.max_load1 = self.no_threads * 0.9
333-
# self.available_slots += (psutil.cpu_count(logical=True) - self.available_slots) / 2
334346
print(f"""Machine Info:
335347
- {psutil.cpu_count(logical=False)} Cores / {psutil.cpu_count(logical=True)} Threads
348+
- {platform.processor()} processor architecture
336349
- {psutil.virtual_memory()} virtual Memory
337350
- {self.max_load} / {self.max_load1} configured maximum load 0 / 1
338351
- {self.available_slots} test slots
339352
- {str(TEMP)} - temporary directory
353+
- current Disk I/O: {str(psutil.disk_io_counters())}
340354
""")
341355
self.cfgdir = base_source_dir / 'etc' / 'relative'
342356
self.bin_dir = bin_dir
@@ -426,13 +440,15 @@ def __init__(self, cfg):
426440
self.success = True
427441
self.crashed = False
428442
self.cluster = False
443+
self.datetime_format = "%Y-%m-%dT%H%M%SZ"
429444

430445
def print_active(self):
431446
""" output currently active testsuites """
432447
with self.slot_lock:
433-
print("Running: " + str(self.running_suites) +
448+
print(str(psutil.getloadavg()) + "<= Load " +
449+
"Running: " + str(self.running_suites) +
434450
" => Active Slots: " + str(self.used_slots) +
435-
" => Load: " + str(psutil.getloadavg()))
451+
" => Disk I/O: " + str(psutil.disk_io_counters()))
436452
sys.stdout.flush()
437453

438454
def done_job(self, parallelity):
@@ -454,7 +470,8 @@ def launch_next(self, offset, counter):
454470
load = psutil.getloadavg()
455471
if ((load[0] > self.cfg.max_load) or
456472
(load[1] > self.cfg.max_load1)):
457-
print(F"Load to high: {str(load)} waiting before spawning more")
473+
print(F"{str(load)} <= Load to high; waiting before spawning more - Disk I/O: " +
474+
str(psutil.disk_io_counters()))
458475
return False
459476
with self.slot_lock:
460477
self.used_slots += self.scenarios[offset].parallelity
@@ -617,13 +634,16 @@ def generate_crash_report(self):
617634
core_max_count = 15 # 3 cluster instances
618635
core_dir = Path.cwd()
619636
core_pattern = "core*"
637+
if IS_WINDOWS:
638+
core_pattern = "*.dmp"
639+
system_corefiles = []
620640
if 'COREDIR' in os.environ:
621641
core_dir = Path(os.environ['COREDIR'])
642+
else:
643+
core_dir = Path('/var/tmp/') # default to coreDirectory in testing.js
622644
if IS_MAC:
623-
core_dir = Path('/cores')
624-
if IS_WINDOWS:
625-
core_pattern = "*.dmp"
626-
files = sorted(core_dir.glob(core_pattern))
645+
system_corefiles = sorted(Path('/cores').glob(core_pattern))
646+
files = sorted(core_dir.glob(core_pattern)) + system_corefiles
627647
if len(files) > core_max_count:
628648
count = 0
629649
for one_crash_file in files:
@@ -633,7 +653,7 @@ def generate_crash_report(self):
633653
one_crash_file.unlink(missing_ok=True)
634654
is_empty = len(files) == 0
635655
if self.crashed or not is_empty:
636-
crash_report_file = get_workspace() / datetime.now(tz=None).strftime("crashreport-%d-%b-%YT%H.%M.%SZ")
656+
crash_report_file = get_workspace() / datetime.now(tz=None).strftime(f"crashreport-{self.datetime_format}")
637657
print("creating crashreport: " + str(crash_report_file))
638658
sys.stdout.flush()
639659
shutil.make_archive(str(crash_report_file),
@@ -642,7 +662,7 @@ def generate_crash_report(self):
642662
core_dir.name,
643663
True)
644664
self.cleanup_unneeded_binary_files()
645-
binary_report_file = get_workspace() / datetime.now(tz=None).strftime("binaries-%d-%b-%YT%H.%M.%SZ")
665+
binary_report_file = get_workspace() / datetime.now(tz=None).strftime(f"binaries-{self.datetime_format}")
646666
print("creating crashreport binary support zip: " + str(binary_report_file))
647667
sys.stdout.flush()
648668
shutil.make_archive(str(binary_report_file),
@@ -657,7 +677,7 @@ def generate_crash_report(self):
657677

658678
def generate_test_report(self):
659679
""" regular testresults zip """
660-
tarfile = get_workspace() / datetime.now(tz=None).strftime("testreport-%d-%b-%YT%H.%M.%SZ")
680+
tarfile = get_workspace() / datetime.now(tz=None).strftime(f"testreport-{self.datetime_format}")
661681
print("Creating " + str(tarfile))
662682
sys.stdout.flush()
663683
shutil.make_archive(self.cfg.run_root / 'innerlogs',

0 commit comments

Comments
 (0)