# Imputation using random draws from shifted normal distribution

In [None]:
import logging

import pandas as pd
from IPython.display import display

import pimmslearn
import pimmslearn.imputation
import pimmslearn.model
import pimmslearn.models as models
import pimmslearn.nb
from pimmslearn.io import datasplits

# Configure the 'pimmslearn' logger through the package's setup helper.
logger = pimmslearn.logging.setup_logger(logging.getLogger('pimmslearn'))
# Log which imputation method this notebook runs.
logger.info("Imputation using random draws from shifted normal distribution")

figures = {}  # collection of ax or figures

In [None]:
# catch passed parameters
# `args` is bound first so that its own name is already part of the
# snapshot taken on the next line.
args = None
# Snapshot of the global names defined so far (keys of a copy of globals());
# it is handed to pimmslearn.nb.get_params in a later cell.
args = dict(globals()).keys()

Papermill script parameters:

In [None]:
# files and folders
# Datasplit folder with data for experiment
folder_experiment: str = 'runs/example'
file_format: str = 'csv'  # file format of create splits, default pickle (pkl)
# Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
# model
sample_idx_position: int = 0  # position of index which is sample ID
axis: int = 1  # impute per row/sample (1) or per column/feat (0).
# NOTE(review): the axis numbering in the next comment differs from the
# `axis` comment above -- confirm which one is intended.
completeness: float = 0.6  # fraction of non missing values for row/sample (axis=0) or column/feat (axis=1)
# model key (lower cased version will be used for file names)
model_key: str = 'RSN'
model: str = 'RSN'  # model name
save_pred_real_na: bool = True  # Save all predictions for real na
# metadata -> defaults for metadata extracted from machine data
meta_date_col: str = None  # date column in meta data
meta_cat_col: str = None  # category column in meta data

Some argument transformations

In [None]:
# Pass the earlier snapshot of global names together with the current
# globals to get_params; presumably this yields the parameters defined
# after the snapshot -- confirm against pimmslearn.nb.
args = pimmslearn.nb.get_params(args, globals=globals())
args

In [None]:
# Convert the collected parameters into the object used below with
# attribute access (args.data, args.out_preds, ...).
args = pimmslearn.nb.args_from_dict(args)
args

Some naming conventions

In [None]:
# File name template with a single `{}` placeholder.
# NOTE(review): not referenced in the visible part of this notebook.
TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'

## Load data in long format

In [None]:
# Load the data splits from the folder args.data using the configured
# file format.
data = datasplits.DataSplits.from_folder(
    args.data, file_format=args.file_format)

data is loaded in long format

In [None]:
# Show five randomly drawn entries of the training data.
data.train_X.sample(5)

Infer index names from long format

In [None]:
# Split the index level names of the long-format training data into the
# sample identifier (at args.sample_idx_position) and the remaining
# feature level(s).
index_columns = list(data.train_X.index.names)
sample_id = index_columns.pop(args.sample_idx_position)
if len(index_columns) != 1:
    # Anything but exactly one feature level is unsupported. Failing here
    # also covers the case of no feature level at all, which would
    # otherwise hit an unbound `index_column` further down.
    logger.info(f"{sample_id = }, multiple features: {index_columns = }")
    raise NotImplementedError(
        f"Expected exactly one feature index level, found {len(index_columns)}: "
        "Needs to be implemented. see above logging output.")

index_column = index_columns.pop()
logger.info(f"{sample_id = }, single feature: {index_column = }")
# Names of both index levels: sample first, feature second.
index_columns = [sample_id, index_column]

load meta data for splits

In [None]:
# Metadata is optional: df_meta stays None unless a file name was given.
df_meta = None
if args.fn_rawfile_metadata:
    df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0)
    # Show the metadata rows selected by the first index level of the
    # training data.
    display(df_meta.loc[data.train_X.index.levels[0]])

## Initialize Comparison


In [None]:
# Load the feature frequencies stored alongside the data splits.
freq_feat = datasplits.load_freq(args.data)
freq_feat.head()  # training data

### Produce some additional fake samples

The simulated NAs of the validation split are used by all models to evaluate training performance.

In [None]:
# Start the validation prediction table with the held-out values in a
# column named 'observed'; model predictions are added as further columns.
val_pred_fake_na = data.val_y.to_frame(name='observed')
val_pred_fake_na

In [None]:
# Same as for validation: held-out test values in a column named
# 'observed', followed by summary statistics of that table.
test_pred_fake_na = data.test_y.to_frame(name='observed')
test_pred_fake_na.describe()

## Data in wide format

In [None]:
# Switch the splits to wide format; data.train_X is read as a table from
# here on.
data.to_wide_format()
# M: size of the last axis of the wide training data.
args.M = data.train_X.shape[-1]
data.train_X.head()

### Impute using shifted normal distribution

In [None]:
# Impute on the wide training data and keep the result as a one-column
# frame named 'intensity'.
imputed_shifted_normal = (
    pimmslearn.imputation
    .impute_shifted_normal(
        data.train_X,
        mean_shift=1.8,
        std_shrinkage=0.3,
        completeness=args.completeness,
        axis=args.axis,
    )
    .to_frame('intensity')
)
imputed_shifted_normal

In [None]:
# Add the imputed values as a column named after the model to both
# prediction tables (validation first, then test).
for _pred_split in (val_pred_fake_na, test_pred_fake_na):
    _pred_split[args.model] = imputed_shifted_normal
val_pred_fake_na

Save predictions for NA

In [None]:
if args.save_pred_real_na:
    # Boolean series over all (row, column) pairs of the wide training data,
    # True where no value is present.
    mask = data.train_X.isna().stack()
    idx_real_na = mask.index[mask]
    # Entries held out for validation and test are removed from that index.
    for _simulated in (val_pred_fake_na, test_pred_fake_na):
        idx_real_na = idx_real_na.drop(_simulated.index)
    # A throw-away column carries the index so the imputed values can be
    # joined onto it; the helper column is dropped again afterwards.
    _placeholder = pd.Series(0, index=idx_real_na, name='placeholder').to_frame()
    pred_real_na = _placeholder.join(imputed_shifted_normal)
    pred_real_na = pred_real_na.drop('placeholder', axis=1)
    display(pred_real_na)
    pred_real_na.to_csv(args.out_preds / f"pred_real_na_{args.model_key}.csv")


### Plots

In [None]:
# Call plot_errors_binned on the validation predictions; only the first
# returned element is kept.
# NOTE(review): the result is not stored in `figures`.
ax, _ = pimmslearn.plotting.errors.plot_errors_binned(val_pred_fake_na)

In [None]:
# Same plot for the test predictions; `ax` from the previous cell is
# overwritten.
ax, _ = pimmslearn.plotting.errors.plot_errors_binned(test_pred_fake_na)

## Comparisons

### Validation data

- all measured (identified, observed) peptides in validation data

In [None]:
# papermill_description=metrics
# Container that collects the metrics of the validation and test splits.
d_metrics = models.Metrics()

The fake NA for the validation step are real test data (not used for training nor early stopping)

In [None]:
# Register the validation predictions under the key 'valid_fake_na' and
# show what add_metrics returned.
added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')
added_metrics

### Test Datasplit

Fake NAs: Artificially created NAs. Some data was sampled and set
explicitly to missing before it was fed to the model for
reconstruction.

In [None]:
# Register the test predictions under the key 'test_fake_na' and show
# what add_metrics returned.
added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')
added_metrics

The fake NA for the validation step are real test data

### Save all metrics as json

In [None]:
# Dump the collected metrics to metrics_<model_key>.json in
# args.out_metrics, then display the metrics object.
pimmslearn.io.dump_json(d_metrics.metrics, args.out_metrics /
                        f'metrics_{args.model_key}.json')
d_metrics

In [None]:
# Build a table from the nested metrics dictionary using the level names
# 'model' and 'metric_name', then transpose it for display.
metrics_df = models.get_df_from_nested_dict(
    d_metrics.metrics, column_levels=['model', 'metric_name']).T
metrics_df

## Save predictions

In [None]:
# Write the validation table, then the test table, to args.out_preds and
# record each file path on `args` under the file's stem.
for fname, _preds in (
    (args.out_preds / f"pred_val_{args.model_key}.csv", val_pred_fake_na),
    (args.out_preds / f"pred_test_{args.model_key}.csv", test_pred_fake_na),
):
    setattr(args, fname.stem, fname.as_posix())  # add [] assignment?
    _preds.to_csv(fname)

## Config

In [None]:
# Display the figure collection created at the top of the notebook.
figures  # switch to fnames?

In [None]:
# Dump the arguments of this run to model_config_<model_key>.yaml in
# args.out_models, then display them.
args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml")
args