-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path00_8_add_random_missing_values.py
73 lines (60 loc) · 1.98 KB
/
00_8_add_random_missing_values.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# %% [markdown]
# # Add MAR to dataset
# Add missing at random (MAR) values to the dataset
# %%
from pathlib import Path
from typing import Optional, Union
import pandas as pd
import pimmslearn.nb
# %%
# Catch passed parameters: take a snapshot of the global names defined so far.
# `args` is bound first so that the name `args` itself is part of the snapshot.
# The snapshot is handed to `pimmslearn.nb.get_params` further down.
args = None
args = globals().copy().keys()
# %%
# Default parameters of this script.
# Samples (rows) with intensities for features (columns)
fn_intensities: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'
index_col: Optional[Union[tuple, str, int]] = 0
# column index name, e.g. Protein Groups, peptides, etc.
col_name: Optional[str] = None
folder_experiment: str = 'runs/example'
sample_frac: float = .8  # fraction of intensities to keep
random_state: int = 42  # random state for reproducibility
folder_data: str = ''  # specify data directory if needed
file_format: str = 'csv'  # format of created splits; reset below from fn_intensities suffix
out_root: Optional[str] = None  # specify output folder if needed
# NOTE(review): this replaces the default input file set above - confirm intended.
fn_intensities = "data/ALD_study/processed/ald_plasma_proteinGroups.pkl"
# %%
# Turn the input file name into a path and build the configuration object.
# `get_params` receives the earlier snapshot of global names together with the
# current globals - presumably to pick up the parameters defined in between
# (see pimmslearn.nb; TODO confirm). Avoid defining helper variables here.
fn_intensities = Path(fn_intensities)
if not out_root:
    # no output folder specified: default to the folder of the input file
    out_root = fn_intensities.parent
args = pimmslearn.nb.get_params(args, globals=globals())
args = pimmslearn.nb.args_from_dict(args)
args
# %%
# Load the intensities with the pandas reader matching the input file suffix.
file_format = args.fn_intensities.suffix
FILE_FORMAT_TO_CONSTRUCTOR = {'.csv': 'read_csv',
                              '.pkl': 'read_pickle',
                              '.pickle': 'read_pickle',
                              }
if file_format not in FILE_FORMAT_TO_CONSTRUCTOR:
    # fail early with a clear message instead of a bare KeyError
    raise ValueError(
        f"Unknown file format {file_format!r} of {args.fn_intensities}; "
        f"supported: {', '.join(FILE_FORMAT_TO_CONSTRUCTOR)}")
load_fct = getattr(pd, FILE_FORMAT_TO_CONSTRUCTOR[file_format])
try:
    df = load_fct(args.fn_intensities, index_col=args.index_col)
except TypeError:
    # the selected reader does not take `index_col` (e.g. `pd.read_pickle`)
    df = load_fct(args.fn_intensities)
df
# %%
# Optionally set the name of the column index (e.g. Protein Groups, peptides).
if args.col_name:
    df.columns.name = args.col_name
# %%
# Keep a random fraction (`sample_frac`) of the observed intensities: go to
# long format, sample the entries, and go back to wide format. Entries that
# were not sampled end up as missing values in the wide table.
sampled = (
    df.stack()
    # drop missing entries explicitly, so only observed values are sampled
    # regardless of whether `stack` itself drops NaN in the pandas version used
    .dropna()
    .sample(frac=args.sample_frac, random_state=args.random_state)
    .unstack()
)
sampled
# %%
# Output file "<input file stem>_<sample_frac with two decimals>.pkl" located
# in `args.out_folder`.
# NOTE(review): `out_root` is computed earlier, but the path here is built from
# `args.out_folder` - confirm which one is intended.
fname_out = (args.out_folder
             / f"{args.fn_intensities.stem}_{args.sample_frac:0.2f}.pkl")
fname_out.as_posix()
# %%
# Store the sampled (wide format) data as a pickle file.
sampled.to_pickle(fname_out)
# %%