Skip to content

Commit ced4533

Browse files
dothebartKVS85jsteemann
authored
Feature/python test runner (#417)
* start implementing a python launch controller * make it work for the first time * try launching outside of oskar. * no more pipes needed * adjust report directory * fix paths, thread naming. * fallback if no env is configured * lint * more work on cluster etc * silence, proper error message for missing variable * convert params * lint * fix slot * fix arangosh.conf, launching of subsequent testruns * try to launch it from fish * implement 7zip * add modules to the docker container * more printing * fix handling * Add pip3 * Fix typo * Typo 2 * handle INNERWORKDIR * fix missing line break * export settings * fix typo * on windows skip !windows tests * lint, refactor, simplify * install 7z * export core directory * work on fish integration * similarize for new python job scheduler * work on report generating * try to implement timeout * also upload 7z and txt * also upload 7z and txt * fix deadline * fix workspace handling * fix temporary directory handling * make sure our temp directory exists * RTFM fail * don't put it to the workspace * implement gtest invoking * cleanup * sort, lint * prefer INNERWORKDIR * implement writing test.log * implement html report * bring back function deleted too early * install the windows boomerang handler on top level * fix include * fix reference * print before killing shit * work on timeout * finish deadline handling, rename script * fix exit code handling * lint * thanks @mpoeter for ps aid * make the thread identifier the test plus a growing number * implement central final deadline, which will kick in after 2 minutes * remove debug output * use /usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/snap/bin to locate python * wintendo next try * wintendo next try * wintendo go home * fix calculation of hard time limit * make sure nobody changes the exit code to good * add monkey patches * cleanup deadline * ignore exceptions if no process is there * deadline handling: prioritize incoming lines over timeout counting * fix 
directory handling * work on result presentation * cleanup * let the file remain open for further info * fix environment variable handling * documentation * fix port handling * work on deadline * fix hard deadline handling * make it 20s * need more time * list processes so we may guess what's actually going on * kill all, then waitpid all * make threads provide half a slot. * be sure to catch * resume just in case, then kill * resume just in case, then kill * ignore resume errors * increase volume * lint * lint * catch more * add multipliers * more load, print load avg * fix sorting by prio - biggest values first * cleanup crash report for size * if test indicates it's been crashing create report as well. * more threat to the machine. * timeout * fix typo * delete tzdata subdir first * use load and sockets for throttle control * install required python libs * only see for load [0, 1] * increase container version * announce deadline at start * don't print to logfile * give better feedback if arangosh fails to launch in first place, thanks @maierlars for bringing up the topic * Update helper.linux.fish * tschuess ruby * re-sync to be stock RTA * fix container numbers, adjust #3 * sync to rta * resync * this is not needed anymore * add --fix-missing * fresh python? * revert to tar.gz * chaos tests in nightlies demand for longer timeouts, since tests run longer. * Update README.md Co-authored-by: Jan <[email protected]> * Update README.md Co-authored-by: Jan <[email protected]> * Update README.md Co-authored-by: Jan <[email protected]> * Update README.md Co-authored-by: Jan <[email protected]> * Update README.md Co-authored-by: Jan <[email protected]> * remove more old stuff * ignore encoding errors * increase timeout to hard self kill * switch to one environment variable name * env * limit the amount of coredumps * ignore access denied to open sockets * if we need to wait for the system to cool down on start... 
* make sure we don't come back good if nothing launched at all * them tiny boxes need more time * need more time * add deadline status to testfailures.txt * need more time * beautify testfailures.txt * give machine estimate reasons at the start of the run * case may matter * one more environment variable Co-authored-by: Vadim <[email protected]> Co-authored-by: Jan <[email protected]>
1 parent 4c1b2a5 commit ced4533

File tree

1 file changed

+30
-18
lines changed

1 file changed

+30
-18
lines changed

Diff for: jenkins/helper/test_launch_controller.py

+30-18
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ def get_workspace():
7575
return Path.cwd() / 'work'
7676

7777
TEMP = Path("/tmp/")
78+
if 'TMP' in os.environ:
79+
TEMP = Path(os.environ['TMP'])
7880
if 'TEMP' in os.environ:
7981
TEMP = Path(os.environ['TEMP'])
8082
if 'INNERWORKDIR' in os.environ:
@@ -302,9 +304,11 @@ def __init__(self, definition_file):
302304
self.timeout = 1800
303305
if 'timeLimit'.upper() in os.environ:
304306
self.timeout = int(os.environ['timeLimit'.upper()])
307+
elif 'timeLimit' in os.environ:
308+
self.timeout = int(os.environ['timeLimit'])
305309
if psutil.cpu_count() <= 8:
306-
print("Small machine detected, trippling deadline!")
307-
self.timeout *= 3
310+
print("Small machine detected, quadrupling deadline!")
311+
self.timeout *= 4
308312
self.deadline = datetime.now() + timedelta(seconds=self.timeout)
309313
self.hard_deadline = datetime.now() + timedelta(seconds=self.timeout + 660)
310314
if definition_file.is_file():
@@ -315,7 +319,21 @@ def __init__(self, definition_file):
315319
for target in ['RelWithdebInfo', 'Debug']:
316320
if (bin_dir / target).exists():
317321
bin_dir = bin_dir / target
318-
322+
self.no_threads = psutil.cpu_count()
323+
self.available_slots = round(self.no_threads * 2) #logical=False)
324+
if IS_WINDOWS:
325+
self.max_load = 0.85
326+
self.max_load1 = 0.75
327+
else:
328+
self.max_load = self.no_threads * 0.9
329+
self.max_load1 = self.no_threads * 0.9
330+
# self.available_slots += (psutil.cpu_count(logical=True) - self.available_slots) / 2
331+
print(f"""Machine Info:
332+
- {psutil.cpu_count(logical=False)} Cores / {psutil.cpu_count(logical=True)} Threads
333+
- {psutil.virtual_memory()} virtual Memory
334+
- {self.max_load} / {self.max_load1} configured maximum load 0 / 1
335+
- {self.available_slots} test slots
336+
""")
319337
self.cfgdir = base_source_dir / 'etc' / 'relative'
320338
self.bin_dir = bin_dir
321339
self.base_path = base_source_dir
@@ -396,15 +414,6 @@ def __init__(self, cfg):
396414
self.cfg = cfg
397415
self.deadline_reached = False
398416
self.slot_lock = Lock()
399-
self.no_threads = psutil.cpu_count()
400-
self.available_slots = round(self.no_threads * 2) #logical=False)
401-
if IS_WINDOWS:
402-
self.max_load = 0.85
403-
self.max_load1 = 0.75
404-
else:
405-
self.max_load = self.no_threads * 0.9
406-
self.max_load1 = self.no_threads * 0.9
407-
# self.available_slots += (psutil.cpu_count(logical=True) - self.available_slots) / 2
408417
self.used_slots = 0
409418
self.scenarios = []
410419
self.arangosh = ArangoshExecutor(self.cfg, self.slot_lock)
@@ -429,7 +438,7 @@ def done_job(self, parallelity):
429438

430439
def launch_next(self, offset, counter):
431440
""" launch one testing job """
432-
if self.scenarios[offset].parallelity > (self.available_slots - self.used_slots):
441+
if self.scenarios[offset].parallelity > (self.cfg.available_slots - self.used_slots):
433442
return False
434443
try:
435444
sock_count = get_socket_count()
@@ -439,8 +448,8 @@ def launch_next(self, offset, counter):
439448
except psutil.AccessDenied:
440449
pass
441450
load = psutil.getloadavg()
442-
if ((load[0] > self.max_load) or
443-
(load[1] > self.max_load1)):
451+
if ((load[0] > self.cfg.max_load) or
452+
(load[1] > self.cfg.max_load1)):
444453
print(F"Load to high: {str(load)} waiting before spawning more")
445454
return False
446455
with self.slot_lock:
@@ -514,6 +523,7 @@ def handle_deadline(self):
514523

515524
def testing_runner(self):
516525
""" run testing suites """
526+
# pylint: disable=too-many-branches
517527
mem = psutil.virtual_memory()
518528
os.environ['ARANGODB_OVERRIDE_DETECTED_TOTAL_MEMORY'] = str(int((mem.total * 0.8) / 9))
519529

@@ -535,8 +545,8 @@ def testing_runner(self):
535545
used_slots = 0
536546
with self.slot_lock:
537547
used_slots = self.used_slots
538-
if self.available_slots > used_slots and start_offset < len(self.scenarios):
539-
print(f"Launching more: {self.available_slots} > {used_slots} {counter}")
548+
if self.cfg.available_slots > used_slots and start_offset < len(self.scenarios):
549+
print(f"Launching more: {self.cfg.available_slots} > {used_slots} {counter}")
540550
sys.stdout.flush()
541551
if self.launch_next(start_offset, counter):
542552
start_offset += 1
@@ -573,7 +583,9 @@ def generate_report_txt(self):
573583
for testrun in self.scenarios:
574584
print(testrun)
575585
if testrun.crashed or not testrun.success:
576-
summary += testrun.summary
586+
summary += f"\n=== {testrun.name} ===\n{testrun.summary}"
587+
if testrun.finish is None:
588+
summary += f"\n=== {testrun.name} ===\nhasn't been launched at all!"
577589
print(summary)
578590
(get_workspace() / 'testfailures.txt').write_text(summary)
579591

0 commit comments

Comments
 (0)