MRG Deprecates 'normalize' in LinearRegression (_base.py) (scikit-learn#17743)

maikia · glemaitre · NicolasHug · web-flow · commit 306826f7b6bf · 2021-01-22T14:57:23.000+01:00
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Co-authored-by: Nicolas Hug <contact@nicolas-hug.com>
Co-authored-by: Alexandre Gramfort <alexandre.gramfort@m4x.org>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
@@ -94,6 +94,18 @@ Changelog
   Use ``var_`` instead.
   :pr:`18842` by :user:`Hong Shao Yang <hongshaoyang>`.
 
+- |API|: The parameter ``normalize`` of :class:`linear_model.LinearRegression`
+  is deprecated and will be removed in 1.2.
+  Motivation for this deprecation: the ``normalize`` parameter had no
+  effect if ``fit_intercept`` was set to False and was therefore deemed
+  confusing.
+  The behavior of the deprecated ``LinearRegression(normalize=True)`` can be
+  reproduced with a :class:`~sklearn.pipeline.Pipeline` containing a
+  :class:`~sklearn.preprocessing.StandardScaler` as follows:
+  ``make_pipeline(StandardScaler(with_mean=False), LinearRegression())``.
+  :pr:`17743` by :user:`Maria Telenczuk <maikia>` and
+  :user:`Alexandre Gramfort <agramfort>`.
+
 Code and Documentation Contributors
 -----------------------------------
 
diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
@@ -11,6 +11,7 @@
 #         Lars Buitinck
 #         Maryan Morel <maryan.morel@polytechnique.edu>
 #         Giorgio Patrini <giorgio.patrini@anu.edu.au>
+#         Maria Telenczuk <https://github.com/maikia>
 # License: BSD 3 clause
 
 from abc import ABCMeta, abstractmethod
@@ -49,6 +50,94 @@
 # intercept oscillation.
 
 
+# FIXME in 1.2: parameter 'normalize' should be removed from linear models
+# in cases where now normalize=False. The default value of 'normalize' should
+# be changed to False in linear models where now normalize=True
+def _deprecate_normalize(normalize, default, estimator_name):
+    """Return the effective ``normalize`` value and warn about deprecation.
+
+    ``normalize`` is being deprecated from linear models in favour of a
+    pipeline with a StandardScaler. The warning issued depends on the
+    estimator's default ``normalize`` value and on the user's choice.
+
+    Parameters
+    ----------
+    normalize : bool or 'deprecated'
+        Value passed by the user; 'deprecated' is the unset sentinel.
+
+    default : bool
+        Default ``normalize`` value used by the estimator.
+
+    estimator_name : str
+        Name of the calling linear estimator, used in the warning messages.
+
+    Returns
+    -------
+    normalize : bool
+        Value the estimator should use during the deprecation period.
+
+    Raises
+    ------
+    ValueError
+        If ``normalize`` is not True, False or 'deprecated'.
+
+    Notes
+    -----
+    This function should be updated in 1.2 depending on the value of
+    `normalize`:
+    - True, warning: `normalize` was deprecated in 1.2 and will be removed in
+      1.4. Suggest to use pipeline instead.
+    - False, `normalize` was deprecated in 1.2 and it will be removed in 1.4.
+      Leave normalize to its default value.
+    - `deprecated` - this should only be possible with default == False as from
+      1.2 `normalize` in all the linear models should be either removed or the
+      default should be set to False.
+    This function should be completely removed in 1.4.
+    """
+
+    if normalize not in [True, False, 'deprecated']:
+        raise ValueError("Leave 'normalize' to its default value or set it "
+                         "to True or False")
+
+    _normalize = default if normalize == 'deprecated' else normalize
+
+    if default and normalize == 'deprecated':
+        warnings.warn(
+            "The default of 'normalize' will be set to False in version 1.2 "
+            "and deprecated in version 1.4. \nPass normalize=False and use "
+            "Pipeline with a StandardScaler in a preprocessing stage if you "
+            "wish to reproduce the previous behavior:\n"
+            "model = make_pipeline(StandardScaler(with_mean=False), \n"
+            f"{estimator_name}(normalize=False))\n"
+            "If you wish to use additional parameters in "
+            "the fit() you can include them as follows:\n"
+            "kwargs = {model.steps[-1][0] + "
+            "'__<your_param_name>': <your_param_value>}\n"
+            "model.fit(X, y, **kwargs)", FutureWarning
+        )
+    elif normalize != 'deprecated' and normalize and not default:
+        warnings.warn(
+            "'normalize' was deprecated in version 1.0 and will be "
+            "removed in 1.2.\nIf you still wish to normalize use "
+            "Pipeline with a StandardScaler in a preprocessing stage if you "
+            "wish to reproduce the previous behavior:\n"
+            "model = make_pipeline(StandardScaler(with_mean=False), "
+            f"{estimator_name}()). \nIf you wish to use additional "
+            "parameters in the fit() you can include them as follows: "
+            "kwargs = {model.steps[-1][0] + "
+            "'__<your_param_name>': <your_param_value>}\n"
+            "model.fit(X, y, **kwargs)", FutureWarning
+        )
+    elif not normalize and not default:
+        warnings.warn(
+            "'normalize' was deprecated in version 1.0 and will be"
+            " removed in 1.2. Don't set 'normalize' parameter"
+            " and leave it to its default value", FutureWarning
+        )
+
+    return _normalize
+
+
 def make_dataset(X, y, sample_weight, random_state=None):
     """Create ``Dataset`` abstraction for sparse and dense inputs.
 
@@ -407,6 +496,10 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
         :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
         on an estimator with ``normalize=False``.
 
+        .. deprecated:: 1.0
+           `normalize` was deprecated in version 1.0 and will be
+           removed in 1.2.
+
     copy_X : bool, default=True
         If True, X will be copied; else, it may be overwritten.
 
@@ -476,8 +569,8 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
     array([16.])
     """
     @_deprecate_positional_args
-    def __init__(self, *, fit_intercept=True, normalize=False, copy_X=True,
-                 n_jobs=None, positive=False):
+    def __init__(self, *, fit_intercept=True, normalize='deprecated',
+                 copy_X=True, n_jobs=None, positive=False):
         self.fit_intercept = fit_intercept
         self.normalize = normalize
         self.copy_X = copy_X
@@ -507,6 +600,11 @@ def fit(self, X, y, sample_weight=None):
         self : returns an instance of self.
         """
 
+        _normalize = _deprecate_normalize(
+            self.normalize, default=False,
+            estimator_name=self.__class__.__name__
+        )
+
         n_jobs_ = self.n_jobs
 
         accept_sparse = False if self.positive else ['csr', 'csc', 'coo']
@@ -519,7 +617,7 @@ def fit(self, X, y, sample_weight=None):
                                                  dtype=X.dtype)
 
         X, y, X_offset, y_offset, X_scale = self._preprocess_data(
-            X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
+            X, y, fit_intercept=self.fit_intercept, normalize=_normalize,
             copy=self.copy_X, sample_weight=sample_weight,
             return_mean=True)
 
@@ -651,10 +749,12 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy,
             check_input=check_input, sample_weight=sample_weight)
     if sample_weight is not None:
         X, y = _rescale_data(X, y, sample_weight=sample_weight)
+
+    # FIXME: 'normalize' to be removed in 1.2
     if hasattr(precompute, '__array__'):
         if (fit_intercept and not np.allclose(X_offset, np.zeros(n_features))
-                or normalize and not np.allclose(X_scale,
-                                                 np.ones(n_features))):
+                or normalize and not np.allclose(X_scale, np.ones(n_features)
+                                                 )):
             warnings.warn(
                 "Gram matrix was provided but X was centered to fit "
                 "intercept, or X was normalized : recomputing Gram matrix.",
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
@@ -17,6 +17,7 @@
 from sklearn.utils.fixes import parse_version
 
 from sklearn.linear_model import LinearRegression
+from sklearn.linear_model._base import _deprecate_normalize
 from sklearn.linear_model._base import _preprocess_data
 from sklearn.linear_model._base import _rescale_data
 from sklearn.linear_model._base import make_dataset
@@ -106,6 +107,7 @@ def test_raises_value_error_if_positive_and_sparse():
     with pytest.raises(TypeError, match=error_msg):
         reg.fit(X, y)
 
+
 def test_raises_value_error_if_sample_weights_greater_than_1d():
     # Sample weights must be either scalar or 1D
 
@@ -149,6 +151,59 @@ def test_fit_intercept():
             lr3_without_intercept.coef_.ndim)
 
 
+def test_error_on_wrong_normalize():
+    # any value other than True, False or 'deprecated' must be rejected
+    normalize = 'wrong'
+    default = True
+    error_msg = "Leave 'normalize' to its default"
+    with pytest.raises(ValueError, match=error_msg):
+        _deprecate_normalize(normalize, default, 'estimator')
+
+
+@pytest.mark.parametrize('normalize', [True, False, 'deprecated'])
+@pytest.mark.parametrize('default', [True, False])
+# FIXME: update this test in 1.2 for the new versions
+def test_deprecate_normalize(normalize, default):
+    # test all possible cases of the normalize parameter deprecation
+    if not default:
+        if normalize == 'deprecated':
+            # default is False and the parameter was left unset: no warning
+            output = default
+            expected = None
+            warning_msg = []
+        else:
+            output = normalize
+            expected = FutureWarning
+            warning_msg = ['1.2']
+            if not normalize:
+                warning_msg.append('default value')
+            else:
+                warning_msg.append('StandardScaler(')
+    elif default:
+        if normalize == 'deprecated':
+            # warning to pass False and use StandardScaler
+            output = default
+            expected = FutureWarning
+            warning_msg = ['False', '1.2', 'StandardScaler(']
+        else:
+            # default is True and the value was set explicitly: no warning
+            output = normalize
+            expected = None
+            warning_msg = []
+
+    with pytest.warns(expected) as record:
+        _normalize = _deprecate_normalize(normalize, default, 'estimator')
+    assert _normalize == output
+
+    n_warnings = 0 if expected is None else 1
+    assert len(record) == n_warnings
+    if n_warnings:
+        assert all([
+            warning in str(record[0].message)
+            for warning in warning_msg
+        ])
+
+
 def test_linear_regression_sparse(random_state=0):
     # Test that linear regression also works with sparse data
     random_state = check_random_state(random_state)
@@ -165,6 +220,35 @@ def test_linear_regression_sparse(random_state=0):
         assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)
 
 
+@pytest.mark.parametrize(
+    'normalize, n_warnings, warning',
+    [(True, 1, FutureWarning),
+     (False, 1, FutureWarning),
+     ("deprecated", 0, None)]
+)
+# FIXME: remove test in 1.2 (review: said 1.4, but removal is slated for 1.2)
+def test_linear_regression_normalize_deprecation(
+     normalize, n_warnings, warning
+):
+    # check that a FutureWarning is issued when normalize is set explicitly
+    # in LinearRegression, and none when it is left to its default
+    rng = check_random_state(0)
+    n_samples = 200
+    n_features = 2
+    X = rng.randn(n_samples, n_features)
+    X[X < 0.1] = 0.0
+    y = rng.rand(n_samples)
+
+    model = LinearRegression(normalize=normalize)
+    with pytest.warns(warning) as record:
+        model.fit(X, y)
+    assert len(record) == n_warnings
+    if n_warnings:
+        assert "'normalize' was deprecated" in str(record[0].message)
+
+
+# FIXME: 'normalize' to be removed in 1.2 in LinearRegression
+@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
 @pytest.mark.parametrize('normalize', [True, False])
 @pytest.mark.parametrize('fit_intercept', [True, False])
 def test_linear_regression_sparse_equal_dense(normalize, fit_intercept):
@@ -303,8 +387,9 @@ def test_linear_regression_pd_sparse_dataframe_warning():
         df[str(col)] = arr
 
     msg = "pandas.DataFrame with sparse columns found."
+
+    reg = LinearRegression()
     with pytest.warns(UserWarning, match=msg):
-        reg = LinearRegression()
         reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])
 
     # does not warn when the whole dataframe is sparse
diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -26,6 +26,7 @@
 from sklearn.utils._testing import assert_warns_message
 from sklearn.utils._testing import ignore_warnings
 from sklearn.utils._testing import assert_array_equal
+from sklearn.utils._testing import _convert_container
 from sklearn.utils._testing import TempMemmap
 from sklearn.utils.fixes import parse_version
 
@@ -301,6 +302,8 @@ def test_lasso_cv_positive_constraint():
     assert min(clf_constrained.coef_) >= 0
 
 
+# FIXME: 'normalize' to be removed in 1.2
+@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
 @pytest.mark.parametrize(
     "LinearModel, params",
     [(Lasso, {"tol": 1e-16, "alpha": 0.1}),
@@ -384,6 +387,60 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params):
     assert_allclose(y_pred_normalize, y_pred_standardize)
 
 
+# FIXME: 'normalize' to be removed in 1.2
+@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
+@pytest.mark.parametrize(
+    "estimator, is_sparse, with_mean",
+    [(LinearRegression, True, False),
+     (LinearRegression, False, True),
+     (LinearRegression, False, False)]
+)
+def test_linear_model_sample_weights_normalize_in_pipeline(
+        estimator, is_sparse, with_mean
+):
+    # Test that LinearRegression fitted with sample_weight and normalize=True
+    # gives similar results to LinearRegression(normalize=False) placed in a
+    # pipeline after a StandardScaler and fitted with the same
+    # sample_weight.
+    rng = np.random.RandomState(0)
+    X, y = make_regression(n_samples=20, n_features=5, noise=1e-2,
+                           random_state=rng)
+    # make sure the data is not centered to make the problem more
+    # difficult
+    X += 10
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
+                                                        random_state=rng)
+    if is_sparse:
+        X_train = sparse.csr_matrix(X_train)
+        X_test = _convert_container(X_test, 'sparse')
+
+    sample_weight = rng.rand(X_train.shape[0])
+
+    # linear estimator with explicit sample_weight
+    reg_with_normalize = estimator(normalize=True)
+    reg_with_normalize.fit(X_train, y_train, sample_weight=sample_weight)
+
+    # linear estimator in a pipeline
+    reg_with_scaler = make_pipeline(
+        StandardScaler(with_mean=with_mean),
+        estimator(normalize=False)
+    )
+    kwargs = {reg_with_scaler.steps[-1][0] + '__sample_weight':
+              sample_weight}
+    reg_with_scaler.fit(X_train, y_train, **kwargs)
+
+    y_pred_norm = reg_with_normalize.predict(X_test)
+    y_pred_pip = reg_with_scaler.predict(X_test)
+
+    assert_allclose(
+        reg_with_normalize.coef_ * reg_with_scaler[0].scale_,
+        reg_with_scaler[1].coef_
+    )
+    assert_allclose(y_pred_norm, y_pred_pip)
+
+
+# FIXME: 'normalize' to be removed in 1.2
+@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
 @pytest.mark.parametrize(
     "LinearModel, params",
     [(Lasso, {"tol": 1e-16, "alpha": 0.1}),