-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path03_6_setup_comparison_rev3.py
109 lines (91 loc) · 2.9 KB
/
03_6_setup_comparison_rev3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# %% [markdown]
# # Compare setup of different sampling strategies of simulated data
#
# 1. sampling from all samples
# 2. sampling from subset of samples
# %%
import logging
from pathlib import Path
import pandas as pd
import pimmslearn.nb
import pimmslearn.pandas
import pimmslearn.plotting
from pimmslearn.logging import setup_logger
# Hand the 'pimmslearn' package logger to setup_logger at DEBUG level
# (logging.DEBUG is the stdlib constant for the numeric level 10).
pkg_logger = logging.getLogger('pimmslearn')
logger = setup_logger(logger=pkg_logger, level=logging.DEBUG)
# %%
# parameters
# Folder of the current (rev3) run; also the parent of the 'Subset' inputs.
FOLDER = Path('runs/appl_ald_data_rev3/plasma/')

# Performance summary workbook per setup ('All' comes from the 2023_11 run).
files_in = {
    'All': 'runs/appl_ald_data_2023_11/plasma/proteinGroups/01_2_performance_summary.xlsx',
    'Subset': FOLDER / 'proteinGroups_subset/01_2_performance_summary.xlsx',
}

# Aggregated test predictions per setup, same keys as `files_in`.
pred_in = {
    'All': 'runs/appl_ald_data_2023_11/plasma/proteinGroups/01_2_agg_pred_test.csv',
    'Subset': FOLDER / 'proteinGroups_subset/01_2_agg_pred_test.csv',
}
# %%
# Workbook inside FOLDER that collects the comparison tables.
fname = FOLDER / 'comparison.xlsx'
print(f"{fname = }")
# The writer stays open across the following cells; it is closed in the last cell.
writer = pd.ExcelWriter(path=fname)
# %%
# Build one summary table per setup from the 'mae_stats_ordered_test' sheet
# and place the setups side by side under a (setup, statistic) column index.
summaries = []
for key, file_in in files_in.items():
    summary = (
        pd.read_excel(file_in, index_col=0, sheet_name='mae_stats_ordered_test')
        .iloc[:3]  # first three rows only; 'count' has to be one of them (cast below)
        .T  # the sheet's columns become the rows of the summary
        .dropna()
        .astype({'count': int})
    )
    # Prefix each column with the setup name so both setups fit into one frame.
    summary.columns = pd.MultiIndex.from_tuples(
        [(key, col) for col in summary.columns])
    summaries.append(summary)
cp_all = pd.concat(summaries, axis=1)
cp_all
# %%
cp_all.to_excel(writer, sheet_name='all')
# %%
# Load the aggregated test predictions of each setup, convert them to
# absolute errors and combine them under a (setup, column) column index.
abs_errors = []
for key, file_in in pred_in.items():
    # The first two CSV columns form the row MultiIndex; columns without any
    # value are dropped before the errors are computed.
    pred_setup = pd.read_csv(file_in, index_col=[0, 1]).dropna(axis=1, how='all')
    # NOTE(review): presumably returns one absolute-error column per model;
    # confirm against pimmslearn.pandas.calc_errors.
    pred_setup = pimmslearn.pandas.calc_errors.get_absolute_error(pred_setup)
    pred_setup.columns = pd.MultiIndex.from_tuples(
        [(key, col) for col in pred_setup.columns])
    abs_errors.append(pred_setup)
pred = pd.concat(abs_errors, axis=1)
pred
# %%
# Summary statistics (count, mean, std) of the six selected models, computed
# only on rows where every selected column of both setups has a value.
setups = ['All', 'Subset']
top6_models = ['DAE', 'TRKNN', 'CF', 'RF', 'VAE', 'Median']
stat_names = ['count', 'mean', 'std']

errors_top6 = pred.loc[:, pd.IndexSlice[setups, top6_models]].dropna()
# describe() lists count, mean, std first; transpose so each (setup, model) is a row.
stats_top6 = errors_top6.describe().iloc[:3].T.astype({'count': int})
# Move the setup level into the columns and order them as (setup, statistic).
stats_top6 = stats_top6.unstack(0).swaplevel(0, 1, axis=1)
cp_top6_subset = stats_top6.loc[:, pd.IndexSlice[setups, stat_names]]
cp_top6_subset
# %% [markdown]
# get indices of 1,086 protein groups which are shared between the two setups
# %%
# Index of the rows for which all six selected models have a value in both setups.
models_top6 = ['DAE', 'TRKNN', 'CF', 'RF', 'VAE', 'Median']
col_selection = pd.IndexSlice[['All', 'Subset'], models_top6]
idx_shared = pred.loc[:, col_selection].dropna().index
# %%
# Summary statistics (count, mean, std) of every column of `pred`, restricted
# to the shared rows; one row per model, columns ordered as (setup, statistic).
stats_shared = pred.loc[idx_shared].describe().iloc[:3].T.astype({'count': int})
stats_shared = stats_shared.unstack(0).swaplevel(0, 1, axis=1)
col_order = pd.IndexSlice[['All', 'Subset'], ['count', 'mean', 'std']]
# Use the same row order as `cp_all` so both tables line up.
cp_subset = stats_shared.loc[:, col_order].loc[cp_all.index]
cp_subset
# %%
# Write the shared-row statistics to their own sheet of the workbook.
cp_subset.to_excel(writer, sheet_name='subset')
# %%
# Show the output path once more, then close the writer to finalize the workbook.
print(f"{fname = }")
writer.close()