Skip to content

Commit de88a1f

Browse files
authored
QA-585: harvest resource logs (#614)
* switch location * have commandline to identify scripts * log ppid * write a second jsonl file decyphering logfile -> testing.js PID * write a second jsonl file decyphering logfile -> testing.js PID * write a second jsonl file decyphering logfile -> testing.js PID * write system information as well * write system information as well * write a second jsonl file decyphering logfile -> testing.js PID * fix file mode * resort, move into function, disable cleanup of lower layers * disable delta measuring, we do that while evaluating the results
1 parent e6fce8c commit de88a1f

File tree

5 files changed

+69
-63
lines changed

5 files changed

+69
-63
lines changed

Diff for: jenkins/helper/aggregate_coverage.py

+38-41
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def launch(self):
183183
Path(self.job[1]).rename(Path(self.params['output']))
184184
else:
185185
print("none of our files found in the error message!")
186-
else:
186+
elif self.job[2]:
187187
for one_file in [self.job[0], self.job[1]]:
188188
print('cleaning up')
189189
cleanup_file = Path(one_file)
@@ -196,6 +196,9 @@ def launch(self):
196196
print('file gone')
197197
else:
198198
print(f'file {str(cleanup_file)} already gone?')
199+
print(f"skipping {self.job[0]}")
200+
else:
201+
print(f"skipping this layer with {self.job[0]} {self.job[1]}")
199202
print(f"launch(): returning {ret}")
200203
return ret
201204

@@ -279,14 +282,16 @@ def combine_coverage_dirs_multi(cfg,
279282
if len(sub_jobs) == 1:
280283
print(sub_jobs)
281284
return sub_jobs[0]
285+
layer = 0
282286
while len(sub_jobs) > 1:
283287
next_jobs = []
284288
jobs.append([])
285289
while len(sub_jobs) > 1:
286290
last_output = combined_dir / f'{jobcount}'
287291
this_subjob = [str(sub_jobs.pop()),
288292
str(sub_jobs.pop()),
289-
str(last_output)]
293+
str(last_output),
294+
count < 5]
290295
jobs[count].append(this_subjob)
291296
next_jobs.append(this_subjob[2])
292297
jobcount += 1
@@ -346,39 +351,8 @@ def convert_lcov_to_cobertura(cfg, lcov_file, source_dir, binary, cobertura_xml,
346351
cov = LcovCobertura(cfg)
347352
cov.launch(lcov_file, source_dir, binary, cobertura_xml, excludes)
348353

349-
def main():
350-
""" go """
351-
# pylint disable=too-many-locals disable=too-many-statements
352-
base_dir = Path(sys.argv[1])
353-
os.chdir(base_dir)
354-
coverage_dir = base_dir / 'coverage'
355-
if coverage_dir.exists():
356-
shutil.rmtree(str(coverage_dir))
357-
coverage_dir.mkdir()
358-
gcov_dir = base_dir / sys.argv[2]
359-
#try:
360-
# shutil.make_archive("/work/testresults2124",
361-
# 'tar.gz',
362-
# "/work/gcov",
363-
# "/work/gcov",
364-
# True)
365-
#except Exception as ex:
366-
# print(f"Failed to create zip: {str(ex)}")
367-
cfg = SiteConfig(gcov_dir.resolve())
368-
#import glob
369-
#for filename in glob.iglob('/work/gcov**/**', recursive=True):
370-
# print(filename)
371-
result_dir = combine_coverage_dirs_multi(
372-
cfg,
373-
gcov_dir,
374-
psutil.cpu_count(logical=False))
375-
376-
sourcedir = base_dir / 'ArangoDB'
377-
binary = sourcedir / 'build' / 'bin' / 'arangod'
378-
lcov_file = gcov_dir / 'coverage.lcov'
379-
print('converting to lcov file')
380-
convert_to_lcov_file(cfg, result_dir, lcov_file)
381-
# copy the source files from the sourcecode directory
354+
def copy_source_directory(sourcedir, coverage_dir):
355+
""" copy the source files from the sourcecode directory """
382356
for copy_dir in [
383357
Path('lib'),
384358
Path('arangosh'),
@@ -401,7 +375,36 @@ def main():
401375
for filename in files:
402376
source = os.path.join(root, filename)
403377
shutil.copy2(source, path / filename)
378+
print('create a symlink into the jemalloc source:')
379+
jmdir = sourcedir / '3rdParty' / 'jemalloc' / 'jemalloc' / 'include'
380+
if not jmdir.exists():
381+
jmdir = list((sourcedir / '3rdParty' / 'jemalloc').glob('v*'))[0] / 'include'
382+
(sourcedir / 'include').symlink_to(jmdir)
404383

384+
def main():
385+
""" go """
386+
# pylint disable=too-many-locals disable=too-many-statements
387+
base_dir = Path(sys.argv[1])
388+
os.chdir(base_dir)
389+
coverage_dir = base_dir / 'coverage'
390+
if coverage_dir.exists():
391+
shutil.rmtree(str(coverage_dir))
392+
coverage_dir.mkdir()
393+
gcov_dir = base_dir / sys.argv[2]
394+
cfg = SiteConfig(gcov_dir.resolve())
395+
result_dir = combine_coverage_dirs_multi(
396+
cfg,
397+
gcov_dir,
398+
psutil.cpu_count(logical=False))
399+
400+
sourcedir = base_dir / 'ArangoDB'
401+
binary = sourcedir / 'build' / 'bin' / 'arangod'
402+
lcov_file = gcov_dir / 'coverage.lcov'
403+
404+
copy_source_directory(sourcedir, coverage_dir)
405+
406+
print('converting to lcov file')
407+
convert_to_lcov_file(cfg, result_dir, lcov_file)
405408
print('copy the gcno files from the build directory')
406409
buildir = sourcedir / 'build'
407410
baselen = len(str(buildir))
@@ -413,12 +416,6 @@ def main():
413416
source = os.path.join(root, filename)
414417
shutil.copy2(source, path / filename)
415418

416-
print('create a symlink into the jemalloc source:')
417-
jmdir = sourcedir / '3rdParty' / 'jemalloc' / 'jemalloc' / 'include'
418-
if not jmdir.exists():
419-
jmdir = list((sourcedir / '3rdParty' / 'jemalloc').glob('v*'))[0] / 'include'
420-
(sourcedir / 'include').symlink_to(jmdir)
421-
422419
cobertura_xml = coverage_dir / 'coverage.xml'
423420
print('converting to cobertura report')
424421
convert_lcov_to_cobertura(cfg, lcov_file,

Diff for: jenkins/helper/arangosh.py

+1
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,5 @@ def run_testing(self,
8080
)
8181
delete_logfile_params(params)
8282
ret['error'] = params['error']
83+
ret['pid'] = params['pid']
8384
return ret

Diff for: jenkins/helper/overload_thread.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def overload_thread(sitecfg, _):
1717
"""watcher thread to track system load"""
1818
continue_running = True
1919
print("starting load monitoring thread")
20-
fn =sitecfg.basedir / "overloads.jsonl"
20+
fn =sitecfg.run_root / "overloads.jsonl"
2121
print(f"report file: {str(fn)}")
2222
with open(fn, "w+", encoding="utf-8") as jsonl_file:
2323
while continue_running:
@@ -30,7 +30,7 @@ def overload_thread(sitecfg, _):
3030
load = psutil.getloadavg()
3131
if (load[0] > sitecfg.max_load) or (load[1] > sitecfg.max_load1) or (load[0] > sitecfg.overload):
3232
#print(f"{str(load)} <= {sitecfg.overload} Load to high - Disk I/O: " + str(psutil.swap_memory()))
33-
jsonl_file.write(f'["{datetime.now ()}", {get_all_processes_stats_json()}]\n')
33+
jsonl_file.write(f'["{datetime.now ()}", {get_all_processes_stats_json(load)}]\n')
3434
time.sleep(1)
3535
with END_THREAD_LOCK:
3636
continue_running = not END_THREAD

Diff for: jenkins/helper/testing_runner.py

+3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
from pathlib import Path
66
import hashlib
7+
import json
78
import pprint
89
import re
910
import shutil
@@ -168,6 +169,8 @@ def testing_runner(testing_instance, this, arangosh):
168169
print(traceback.format_exc())
169170
raise ex
170171
with arangosh.slot_lock:
172+
with open((this.cfg.run_root / "job_to_pids.jsonl"), "a+", encoding="utf-8") as jsonl_file:
173+
jsonl_file.write(f'{json.dumps({"pid": ret["pid"], "logfile": str(this.log_file)})}\n')
171174
testing_instance.running_suites.remove(this.name_enum)
172175
testing_instance.done_job(this.parallelity)
173176

Diff for: jenkins/helper/tools/killall.py

+25-20
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ def list_all_processes():
1010
pseaf = "PID Process"
1111
# pylint: disable=catching-non-exception
1212
for process in psutil.process_iter(["pid", "ppid", "name"]):
13+
if process.pid in [1, 2] or process.ppid() == 2:
14+
continue
1315
cmdline = process.name
1416
try:
1517
cmdline = str(process.cmdline())
@@ -27,7 +29,6 @@ def list_all_processes():
2729

2830
def kill_all_arango_processes():
2931
"""list all processes for later reference"""
30-
pseaf = "PID Process"
3132
# pylint: disable=catching-non-exception
3233
for process in psutil.process_iter(["pid", "name"]):
3334
if (process.name().lower().find('arango') >= 0 or
@@ -50,6 +51,8 @@ def gather_process_thread_statistics(p):
5051
ret['process'] = [{
5152
'time': time.ctime(),
5253
'pid': p.pid,
54+
'ppid': p.ppid(),
55+
'cmdline': p.cmdline(),
5356
'name': p.name(),
5457
'percent': p.cpu_percent(),
5558
'iocounters': p.io_counters(),
@@ -72,25 +75,27 @@ def add_delta(p1, p2):
7275
p1[tid]['d_sys'] = p2[tid]['sys'] - p1[tid]['sys']
7376
p1['process'].append(p2['process'][0])
7477

75-
def get_all_processes_stats_json():
78+
def get_all_processes_stats_json(load):
7679
""" aggregate a structure of all processes and their threads plus delta """
80+
# pylint: disable=broad-exception-caught
7781
process_full_list = {}
78-
for n in [True, False]:
79-
processes = psutil.process_iter()
80-
for process in processes:
81-
name = ""
82-
try:
83-
name = process.name()
84-
if process.ppid() != 2 and process.pid not in [1, 2]:
85-
procstat = gather_process_thread_statistics(process)
86-
if n:
87-
process_full_list[f"p{process.pid}"] = procstat
88-
else:
89-
add_delta(process_full_list[f"p{process.pid}"], procstat)
90-
except psutil.AccessDenied:
91-
pass
92-
except Exception as ex:
93-
print(f"while inspecting {name}: {ex} ")
94-
if n:
95-
time.sleep(1)
82+
process_full_list['sys'] = {
83+
'load': load,
84+
'vmem': psutil.virtual_memory(),
85+
'mem': psutil.swap_memory(),
86+
'diskio': psutil.disk_io_counters(perdisk=True, nowrap=True),
87+
'netio': psutil.net_io_counters(pernic=True, nowrap=True),
88+
}
89+
processes = psutil.process_iter()
90+
for process in processes:
91+
name = ""
92+
try:
93+
name = process.name()
94+
if process.pid not in [1, 2] and process.ppid() != 2:
95+
procstat = gather_process_thread_statistics(process)
96+
process_full_list[f"p{process.pid}"] = procstat
97+
except psutil.AccessDenied:
98+
pass
99+
except Exception as ex:
100+
print(f"while inspecting {name}: {ex} ")
96101
return json.dumps(process_full_list)

0 commit comments

Comments (0)