Skip to content

Commit dcc3cd2

Browse files
authored
QA-577: add overload process logging (#612)
* add overload process logging * debug output * fix filename
1 parent 37e4cbc commit dcc3cd2

File tree

4 files changed

+110
-0
lines changed

4 files changed

+110
-0
lines changed

Diff for: jenkins/helper/launch_handler.py

+3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from traceback import print_exc
88

99
from dmesg import DmesgWatcher, dmesg_runner
10+
from overload_thread import spawn_overload_watcher_thread, shutdown_overload_watcher_thread
1011
from site_config import SiteConfig, IS_LINUX
1112
from testing_runner import TestingRunner
1213

@@ -36,6 +37,7 @@ def launch_runner(runner, create_report):
3637
dmesg_thread = Thread(target=dmesg_runner, args=[dmesg], name="dmesg")
3738
dmesg.name = "dmesg"
3839
dmesg_thread.start()
40+
spawn_overload_watcher_thread(runner.cfg)
3941
time.sleep(3)
4042
print(runner.scenarios)
4143
try:
@@ -59,6 +61,7 @@ def launch_runner(runner, create_report):
5961
runner.create_testruns_file()
6062
if IS_LINUX:
6163
dmesg.end_run()
64+
shutdown_overload_watcher_thread()
6265
print('joining dmesg threads')
6366
dmesg_thread.join()
6467
runner.print_and_exit_closing_stance()

Diff for: jenkins/helper/overload_thread.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/bin/env python3
2+
""" check for resource shortage of the test host """
3+
# pylint: disable=global-statement disable=global-variable-not-assigned
4+
from threading import Thread, Lock
5+
import time
6+
from datetime import datetime
7+
import psutil
8+
# from tools.socket_counter import get_socket_count
9+
from tools.killall import get_all_processes_stats_json
10+
11+
# Lock taken around every read and write of END_THREAD.
END_THREAD_LOCK = Lock()
12+
# Stop flag: set to True by shutdown_overload_watcher_thread(), polled by overload_thread().
END_THREAD = False
13+
# The watcher Thread object once spawn_overload_watcher_thread() has run, otherwise None.
OVERLOAD_THREAD = None
14+
15+
16+
def overload_thread(sitecfg, _):
    """Watcher loop that records process statistics while the host is overloaded.

    On every iteration the system load averages are read.  Whenever one of the
    thresholds configured on ``sitecfg`` (``max_load``, ``max_load1``,
    ``overload``) is exceeded, one line holding a timestamp and the statistics
    of all processes is written to ``overloads.jsonl`` below ``sitecfg.basedir``.
    The loop leaves once END_THREAD has been set.

    The second positional argument is accepted but not used.
    """
    print("starting load monitoring thread")
    report_path = sitecfg.basedir / "overloads.jsonl"
    print(f"report file: {report_path!s}")
    with open(report_path, "w+", encoding="utf-8") as report:
        while True:
            # NOTE: socket count monitoring (get_socket_count) is currently disabled.
            averages = psutil.getloadavg()
            first_avg = averages[0]
            second_avg = averages[1]
            overloaded = (
                first_avg > sitecfg.max_load
                or second_avg > sitecfg.max_load1
                or first_avg > sitecfg.overload
            )
            if overloaded:
                # Take the timestamp before gathering: collecting the stats blocks for a while.
                stamp = datetime.now()
                stats_json = get_all_processes_stats_json()
                report.write(f'["{stamp}", {stats_json}]\n')
            time.sleep(1)
            # Only look at the stop flag while holding its lock.
            with END_THREAD_LOCK:
                if END_THREAD:
                    break
38+
39+
40+
def spawn_overload_watcher_thread(siteconfig):
    """Launch the overload watcher thread.

    The module-level stop flag is cleared before the thread is started, so a
    watcher spawned after an earlier shutdown_overload_watcher_thread() call
    keeps running instead of leaving its loop after the first iteration.

    Args:
        siteconfig: configuration object handed to overload_thread(), which
            reads ``basedir``, ``max_load``, ``max_load1`` and ``overload``
            from it.
    """
    global OVERLOAD_THREAD, END_THREAD
    # Re-arm the stop flag; a previous shutdown leaves it set to True.
    with END_THREAD_LOCK:
        END_THREAD = False
    OVERLOAD_THREAD = Thread(target=overload_thread, args=(siteconfig, True))
    OVERLOAD_THREAD.start()
45+
46+
47+
def shutdown_overload_watcher_thread():
    """Request the overload watcher thread to stop and wait for it to finish."""
    global END_THREAD
    # Raise the stop flag under its lock; the watcher polls it once per iteration.
    with END_THREAD_LOCK:
        END_THREAD = True
    watcher = OVERLOAD_THREAD
    if watcher is None:
        # No watcher was ever spawned, nothing to wait for.
        return
    # Join outside the lock: the watcher needs the lock to see the flag.
    watcher.join()

Diff for: jenkins/helper/site_config.py

+1
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ def __init__(self, definition_file):
117117
# pylint: disable=too-many-statements disable=too-many-branches
118118
print_env()
119119
init_temp()
120+
self.basedir = Path.cwd()
120121
self.datetime_format = "%Y-%m-%dT%H%M%SZ"
121122
self.trace = False
122123
self.portbase = 7000

Diff for: jenkins/helper/tools/killall.py

+53
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#!/bin/env python3
22
""" manipulate processes """
3+
import time
4+
import json
35
import sys
46
import psutil
57

@@ -41,3 +43,54 @@ def kill_all_arango_processes():
4143
process.kill()
4244
except psutil.NoSuchProcess: # pragma: no cover
4345
pass
46+
47+
def gather_process_thread_statistics(p):
    """Collect one statistics snapshot of a process and of each of its threads.

    Returns a dict whose ``'process'`` key holds a one-element list with the
    process-wide snapshot, plus one entry per thread id holding that thread's
    ``user`` and ``sys`` times.
    """
    snapshot = {
        'time': time.ctime(),
        'pid': p.pid,
        'name': p.name(),
        'percent': p.cpu_percent(),
        'iocounters': p.io_counters(),
        'ctxSwitches': p.num_ctx_switches(),
        'numfds': p.num_fds(),
        'cpu_times': p.cpu_times(),
        'meminfo': p.memory_full_info(),
        'netcons': p.connections(),
    }
    # The snapshot is wrapped in a list so that add_delta() can append a second one.
    stats = {'process': [snapshot]}
    stats.update({
        thread.id: {'user': thread.user_time, 'sys': thread.system_time}
        for thread in p.threads()
    })
    return stats
65+
66+
def add_delta(p1, p2):
    """Annotate the first sample of a process with the change seen in a second sample.

    For every thread id present in both ``p1`` and ``p2`` the difference in
    ``user`` and ``sys`` time is stored on the ``p1`` entry as ``d_user`` and
    ``d_sys``.  The process-wide snapshot of ``p2`` is then appended to the
    ``'process'`` list of ``p1``.  ``p1`` is modified in place.
    """
    for tid, earlier in p1.items():
        # 'process' holds the snapshot list, not a per-thread record.
        if tid == 'process' or tid not in p2:
            continue
        later = p2[tid]
        earlier['d_user'] = later['user'] - earlier['user']
        earlier['d_sys'] = later['sys'] - earlier['sys']
    second_snapshot = p2['process'][0]
    p1['process'].append(second_snapshot)
74+
75+
def get_all_processes_stats_json():
    """Sample all processes twice, one second apart, and return the result as JSON.

    The first pass stores a snapshot of every inspected process under the key
    ``p<pid>``.  The second pass adds per-thread CPU time deltas and a second
    process snapshot to the stored entry via add_delta().

    Processes with pid 1 or 2, and children of pid 2, are left out
    (presumably init and the kernel threads on Linux -- TODO confirm).
    Processes that cannot be inspected, that exit while being inspected, or
    that only show up during the second pass are skipped silently; any other
    error is printed and the process is skipped.

    Returns:
        str: the JSON encoded dict of all collected process statistics.
    """
    process_full_list = {}
    for first_pass in (True, False):
        for process in psutil.process_iter():
            name = ""
            try:
                name = process.name()
                if process.ppid() == 2 or process.pid in (1, 2):
                    continue
                procstat = gather_process_thread_statistics(process)
                key = f"p{process.pid}"
                if first_pass:
                    process_full_list[key] = procstat
                elif key in process_full_list:
                    # Processes started after the first pass have no baseline
                    # to compute a delta against, so they are left out.
                    add_delta(process_full_list[key], procstat)
            except (psutil.AccessDenied, psutil.NoSuchProcess):
                # Not permitted to look at it, or it went away meanwhile:
                # both are expected on a busy host and not worth reporting.
                pass
            except Exception as ex:
                # Best effort: one misbehaving process must not abort the sample.
                print(f"while inspecting {name}: {ex} ")
        if first_pass:
            # Let one second pass between both samples so the deltas mean something.
            time.sleep(1)
    return json.dumps(process_full_list)

0 commit comments

Comments
 (0)