Skip to content

Pace algorithm implementation for irregular data #669

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 12 commits into
base: develop
Choose a base branch
from
28 changes: 28 additions & 0 deletions docs/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,20 @@ @misc{srivastava++_2011_registration
keywords = {Mathematics - Statistics Theory,Statistics - Applications,Statistics - Methodology}
}

@article{staniswalis+lee_1998_nonparametric_regression,
author = {Staniswalis, Joan G. and Lee, J. Jack},
title = {Nonparametric Regression Analysis of Longitudinal Data},
journal = {Journal of the American Statistical Association},
volume = {93},
number = {444},
pages = {1403--1418},
year = {1998},
publisher = {{ASA Website}},
doi = {10.1080/01621459.1998.10473801},
url = {https://www.tandfonline.com/doi/abs/10.1080/01621459.1998.10473801},
eprint = {https://www.tandfonline.com/doi/pdf/10.1080/01621459.1998.10473801}
}

@article{sun+genton_2011_functional,
title = {Functional Boxplots},
author = {Sun, Ying and Genton, Marc G.},
Expand Down Expand Up @@ -636,3 +650,17 @@ @book{wasserman_2006
isbn = {978-0-387-25145-5},
langid = {english}
}

@article{yao+muller+wang_2005_pace,
author = {Yao, Fang and M{\"u}ller, Hans-Georg and Wang, Jane-Ling},
title = {Functional Data Analysis for Sparse Longitudinal Data},
journal = {Journal of the American Statistical Association},
volume = {100},
number = {470},
pages = {577--590},
year = {2005},
publisher = {{ASA Website}},
doi = {10.1198/016214504000001745},
url = {https://doi.org/10.1198/016214504000001745},
eprint = {https://doi.org/10.1198/016214504000001745}
}
81 changes: 78 additions & 3 deletions skfda/datasets/_real_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
from sklearn.utils import Bunch
from typing_extensions import Literal

from ..representation import FDataGrid
from ..representation.irregular import FDataIrregular
from ..typing._numpy import NDArrayFloat, NDArrayInt
from skfda.representation import FDataGrid
from skfda.representation.irregular import FDataIrregular
from skfda.typing._numpy import NDArrayFloat, NDArrayInt


def fdata_constructor(
Expand Down Expand Up @@ -1644,3 +1644,78 @@ def fetch_bone_density(
target_names=target_names,
DESCR=descr,
)


# Human-readable description of the CD4 dataset. ``fetch_cd4`` exposes this
# text unchanged as the ``DESCR`` entry of the Bunch it returns.
_cd4_descr = """
CD4 cell counts for 366 subjects between months -18 and 42 since
seroconversion. Each subject's observations are contained in a single row.

Format: A data frame made up of a 366 x 61 matrix of CD4 cell counts.

The data is obtained from the R package 'refund' from CRAN.

Source:
https://cran.r-project.org/web/packages/refund/index.html
Goldsmith, J., Greven, S., and Crainiceanu, C. (2013). Corrected
confidence bands for functional data using principal components.
Biometrics, 69(1), 41-51.
"""


def fetch_cd4(
    return_X_y: bool = False,
    as_frame: bool = False,
) -> Bunch | Tuple[FDataIrregular, None] | Tuple[DataFrame, None]:
    """
    Load the CD4 cell counts dataset. This is an irregular dataset.

    Rows contain one curve per subject.

    The data is obtained from the R package 'refund'.

    Args:
        return_X_y: If ``True``, return the tuple ``(data, None)`` instead
            of a ``Bunch``. The second element is always ``None`` because
            this dataset has no target.
        as_frame: If ``True``, the data is a long-format ``DataFrame`` with
            one row per available observation and the columns ``id`` (row
            position of the subject in the raw matrix), ``time`` and
            ``cd4_count``, sorted by ``id`` and then ``time``. If ``False``
            (default), the data is a ``FDataIrregular``.

    Returns:
        ``(data, None)`` when ``return_X_y`` is ``True``. Otherwise a
        ``Bunch`` with the entries ``data``, ``target`` (``None``),
        ``frame`` (the ``DataFrame`` when ``as_frame`` is ``True``, else
        ``None``), ``categories``, ``feature_names``, ``target_names``
        and ``DESCR``.

    """
    descr = _cd4_descr
    raw_dataset = fetch_cran("cd4", "refund")
    # NOTE(review): the accessors used below (``coords``, ``to_numpy``,
    # ``values``) suggest this is an xarray ``DataArray`` -- confirm.
    cd4_array = raw_dataset["cd4"]
    months = cd4_array.coords["dim_1"]

    frame = None
    curves: FDataIrregular | DataFrame

    if as_frame:
        # Wide layout: one row per subject, one column per time label.
        cd4_df = pd.DataFrame(cd4_array.values)
        cd4_df.columns = list(months.values)
        cd4_df.insert(0, "id", range(len(cd4_df)))  # Add ID for each row

        # Long layout: one row per (subject, time) pair; pairs without a
        # recorded value are dropped.
        cd4_df_long = cd4_df.melt(
            id_vars="id",
            var_name="time",
            value_name="cd4_count",
        )
        cd4_df_long = cd4_df_long.dropna()

        # The time labels come from the column names, so convert them to
        # float before sorting in order to sort numerically.
        cd4_df_long["time"] = cd4_df_long["time"].astype(float)
        cd4_df_long = cd4_df_long.sort_values(by=["id", "time"])

        frame = cd4_df_long.reset_index(drop=True)
        curves = frame
    else:
        # The functional objects are only built when they are actually
        # returned; they are not needed for the data frame output.
        cd4_grid = FDataGrid(
            data_matrix=cd4_array.to_numpy().astype(float),
            grid_points=months.to_numpy().astype(float),
        )

        curves = FDataIrregular.from_fdatagrid(
            cd4_grid,
            dataset_name="cd4",
            argument_names=["month"],
            coordinate_names=["CD4 count"],
        )

    if return_X_y:
        return curves, None

    return Bunch(
        data=curves,
        target=None,
        frame=frame,
        categories={},
        feature_names=["cd4"],
        target_names=[],
        DESCR=descr,
    )
2 changes: 2 additions & 0 deletions skfda/preprocessing/dim_reduction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"_fpca": ["FPCA"],
"_fpls": ["FPLS"],
"_neighbor_transforms": ["KNeighborsTransformer"],
"_pace": ["PACE"],
},
)

Expand All @@ -26,6 +27,7 @@
from ._neighbor_transforms import (
KNeighborsTransformer as KNeighborsTransformer,
)
from ._pace import PACE as PACE


def __getattr__(name: str) -> Any:
Expand Down
Loading
Loading