diff --git a/CHANGELOG.md b/CHANGELOG.md index 100d48d9..addc6860 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - [API Change] A new figure and axes is created (via `plt.subplots()`) when calling a plotting method with `ax=None`. Previously, the current axes was used (via `plt.gca()`) ([#211](https://github.com/ploomber/sklearn-evaluation/pull/211)) - [Fix] Validating input elbow curve model has "score" method [#146] - [Fix] Adds class labels for multi class roc plot (#209) +- [API Change] `silhouette_analysis_from_results` function now accepts a list of cluster labels [#213](https://github.com/ploomber/sklearn-evaluation/pull/213) ## 0.9.0 (2023-01-13) diff --git a/docs/api/plot.rst b/docs/api/plot.rst index 503cd3be..7907ed55 100644 --- a/docs/api/plot.rst +++ b/docs/api/plot.rst @@ -59,6 +59,8 @@ elbow_curve ----------- .. autofunction:: sklearn_evaluation.plot.elbow_curve +.. _elbow-curve-from-results-label: + elbow_curve_from_results ------------------------ .. autofunction:: sklearn_evaluation.plot.elbow_curve_from_results @@ -115,6 +117,8 @@ silhouette_analysis ------------------- .. autofunction:: sklearn_evaluation.plot.silhouette_analysis +.. _silhouette-analysis-from-results-label: + silhouette_analysis_from_results -------------------------------- .. autofunction:: sklearn_evaluation.plot.silhouette_analysis_from_results diff --git a/docs/clustering/clustering_evaluation.md b/docs/clustering/clustering_evaluation.md index e4cc47c2..8716081d 100644 --- a/docs/clustering/clustering_evaluation.md +++ b/docs/clustering/clustering_evaluation.md @@ -54,13 +54,11 @@ Elbow curve helps to identify the point at which the plot starts to become paral plot.elbow_curve(X, kmeans, range_n_clusters=range(1, 30)) ``` -##### Elbow curve from results +```{eval-rst} +.. tip:: + + If you want to train the models yourself, you can use :ref:`elbow-curve-from-results-label` to plot. -```{code-cell} ipython3 -import numpy as np -n_clusters = range(1, 10, 2) -sum_of_squares = np.array([4572.2, 470.7, 389.9, 335.1, 305.5]) -plot.elbow_curve_from_results(n_clusters, sum_of_squares, times=None) ``` ##### Silhouette plot @@ -71,23 +69,9 @@ The below plot shows that n_clusters value of 3, 5 and 6 are a bad pick for the silhouette = plot.silhouette_analysis(X, kmeans) ``` -##### Silhouette plot from cluster labels - -```{code-cell} ipython3 -X, y = datasets.make_blobs( - n_samples=500, - n_features=2, - centers=4, - cluster_std=1, - center_box=(-10.0, 10.0), - shuffle=True, - random_state=1, -) +```{eval-rst} +.. tip:: -kmeans = KMeans(n_clusters=4, random_state=1, n_init=5) -cluster_labels = kmeans.fit_predict(X) -``` + If you want to train the models yourself, you can use :ref:`silhouette-analysis-from-results-label` to plot. -```{code-cell} ipython3 -silhouette = plot.silhouette_analysis_from_results(X, cluster_labels) -``` +``` \ No newline at end of file diff --git a/examples/elbow_curve_from_results.py b/examples/elbow_curve_from_results.py new file mode 100644 index 00000000..d3630ffa --- /dev/null +++ b/examples/elbow_curve_from_results.py @@ -0,0 +1,21 @@ +import time +import numpy as np + +from sklearn.cluster import KMeans +from sklearn.datasets import make_blobs + +from sklearn_evaluation import plot + +X, _ = make_blobs(n_samples=100, centers=3, n_features=5, random_state=0) + +n_clusters = range(1, 30) +sum_of_squares = [] +cluster_times = [] +for i in n_clusters: + start = time.time() + kmeans = KMeans(n_clusters=i, n_init=5) + sum_of_squares.append(kmeans.fit(X).score(X)) + cluster_times.append(time.time() - start) + +sum_of_squares = np.absolute(sum_of_squares) +plot.elbow_curve_from_results(n_clusters, sum_of_squares, cluster_times) diff --git a/examples/silhouette_plot_basic.py b/examples/silhouette_plot_basic.py index 9d74b825..49a2eced 100644 --- a/examples/silhouette_plot_basic.py +++ b/examples/silhouette_plot_basic.py @@ -1,6 +1,5 @@ from sklearn.cluster import KMeans from sklearn.datasets import make_blobs -import matplotlib.pyplot as plt from sklearn_evaluation import plot @@ -16,4 +15,3 @@ kmeans = KMeans(random_state=1, n_init=5) plot.silhouette_analysis(X, kmeans, range_n_clusters=[3]) -plt.show() diff --git a/examples/silhouette_plot_from_results.py b/examples/silhouette_plot_from_results.py index b2f38ab6..a8f6f2cb 100644 --- a/examples/silhouette_plot_from_results.py +++ b/examples/silhouette_plot_from_results.py @@ -1,6 +1,5 @@ from sklearn.cluster import KMeans from sklearn.datasets import make_blobs -import matplotlib.pyplot as plt from sklearn_evaluation import plot @@ -14,7 +13,15 @@ random_state=1, ) -kmeans = KMeans(n_clusters=4, random_state=1, n_init=5) -cluster_labels = kmeans.fit_predict(X) +cluster_labels = [] + +# Cluster labels for four clusters +kmeans = KMeans(n_clusters=4, n_init=5) +cluster_labels.append(kmeans.fit_predict(X)) + +# Cluster labels for five clusters +kmeans = KMeans(n_clusters=5, n_init=5) +cluster_labels.append(kmeans.fit_predict(X)) + + plot.silhouette_analysis_from_results(X, cluster_labels) -plt.show() diff --git a/src/sklearn_evaluation/plot/__init__.py b/src/sklearn_evaluation/plot/__init__.py index 3ddf7c4a..47b62a5e 100644 --- a/src/sklearn_evaluation/plot/__init__.py +++ b/src/sklearn_evaluation/plot/__init__.py @@ -21,15 +21,22 @@ silhouette_analysis, silhouette_analysis_from_results, ) -from sklearn_evaluation.plot.regression \ - import residuals, prediction_error, cooks_distance +from sklearn_evaluation.plot.regression import ( + residuals, + prediction_error, + cooks_distance, +) from sklearn_evaluation.plot.target_analysis import target_analysis from sklearn_evaluation.plot.calibration import calibration_curve, scores_distribution -from sklearn_evaluation.plot.classification_report \ - import classification_report, ClassificationReport +from sklearn_evaluation.plot.classification_report import ( + classification_report, + ClassificationReport, +) from sklearn_evaluation.plot.ks_statistics import ks_statistic -from sklearn_evaluation.plot.cumulative_gain_lift_curve \ - import cumulative_gain, lift_curve +from sklearn_evaluation.plot.cumulative_gain_lift_curve import ( + cumulative_gain, + lift_curve, +) from sklearn_evaluation.plot.feature_ranking import Rank1D, Rank2D __all__ = [ diff --git a/src/sklearn_evaluation/plot/clustering.py b/src/sklearn_evaluation/plot/clustering.py index 92a05646..2988fab0 100644 --- a/src/sklearn_evaluation/plot/clustering.py +++ b/src/sklearn_evaluation/plot/clustering.py @@ -40,7 +40,23 @@ from ploomber_core.exceptions import modify_exceptions from ploomber_core import deprecated -# TODO: add unit test + +def _generate_axes(cluster, figsize, ax): + if ax is not None: + if not isinstance(ax, list): + ax = [ax] + if len(cluster) != len(ax): + raise ValueError( + f"Received lengths, cluster : {len(cluster)}," + f"axes : {len(ax)}." + f"Number of axes passed should match number of clusters" + ) + else: + ax = [] + for i in range(len(cluster)): + _, axes = plt.subplots(1, 1, figsize=figsize) + ax.append(axes) + return ax @SKLearnEvaluationLogger.log(feature="plot") @@ -139,9 +155,13 @@ def elbow_curve_from_results(n_clusters, sum_of_squares, times, ax=None): """ Same as `elbow_curve`, but it takes the number of clusters and sum of squares as inputs. Useful if you want to train the models yourself. + + Examples + -------- + .. plot:: ../examples/elbow_curve_from_results.py + """ - # TODO: unit test this - # TODO: also test with unsorted input + idx = np.argsort(n_clusters) n_clusters = np.array(n_clusters)[idx] sum_of_squares = np.array(sum_of_squares)[idx] @@ -183,6 +203,7 @@ def _clone_and_score_clusterer(clf, X, n_clusters): @SKLearnEvaluationLogger.log(feature="plot") +@modify_exceptions def silhouette_analysis( X, clf, @@ -262,13 +283,16 @@ def silhouette_analysis( "Cannot plot silhouette analysis ." ) - for n_clusters in range_n_clusters: - _, ax = plt.subplots(1, 1, figsize=figsize) + # if no ax is passed by user generate new plot + # for each model + + ax = _generate_axes(range_n_clusters, figsize, ax) + + for ax, n_clusters in zip(ax, range_n_clusters): clf = clone(clf) setattr(clf, "n_clusters", n_clusters) cluster_labels = clf.fit_predict(X) - - ax = silhouette_analysis_from_results( + _silhouette_analysis_one_model( X, cluster_labels, metric, figsize, cmap, text_fontsize, ax ) return ax @@ -276,7 +300,7 @@ def silhouette_analysis( @SKLearnEvaluationLogger.log(feature="plot") @modify_exceptions -def silhouette_analysis_from_results( +def _silhouette_analysis_one_model( X, cluster_labels, metric="euclidean", @@ -286,12 +310,7 @@ def silhouette_analysis_from_results( ax=None, ): """ - Same as `silhouette_plot` but takes cluster_labels as input. - Useful if you want to train the model yourself - - Notes - ----- - .. versionadded:: 0.8.3 + Generate silhouette plot for one value of n_cluster. """ cluster_labels = np.asarray(cluster_labels) @@ -365,3 +384,36 @@ def silhouette_analysis_from_results( ax.tick_params(labelsize=text_fontsize) ax.legend(loc="best", fontsize=text_fontsize) return ax + + +@SKLearnEvaluationLogger.log(feature="plot") +@modify_exceptions +def silhouette_analysis_from_results( + X, + cluster_labels, + metric="euclidean", + figsize=None, + cmap="nipy_spectral", + text_fontsize="medium", + ax=None, +): + """ + Same as `silhouette_plot` but takes list of cluster_labels as input. + Useful if you want to train the model yourself + + Examples + -------- + .. plot:: ../examples/silhouette_plot_from_results.py + + """ + + # if no ax is passed by user generate new plot + # for each model + + ax = _generate_axes(cluster_labels, figsize, ax) + + for ax, label in zip(ax, cluster_labels): + _silhouette_analysis_one_model( + X, label, metric, figsize, cmap, text_fontsize, ax + ) + return ax diff --git a/tests/baseline_images/test_clustering/silhouette_analysis_from_results_cluster_five.png b/tests/baseline_images/test_clustering/silhouette_analysis_from_results_cluster_five.png new file mode 100644 index 00000000..5247e129 Binary files /dev/null and b/tests/baseline_images/test_clustering/silhouette_analysis_from_results_cluster_five.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_analysis_from_results_cluster_six.png b/tests/baseline_images/test_clustering/silhouette_analysis_from_results_cluster_six.png new file mode 100644 index 00000000..cfb3c331 Binary files /dev/null and b/tests/baseline_images/test_clustering/silhouette_analysis_from_results_cluster_six.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_plot_cosine.png b/tests/baseline_images/test_clustering/silhouette_plot_cosine.png index 26b39be4..3cdfc40c 100644 Binary files a/tests/baseline_images/test_clustering/silhouette_plot_cosine.png and b/tests/baseline_images/test_clustering/silhouette_plot_cosine.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_plot_five_clusters.png b/tests/baseline_images/test_clustering/silhouette_plot_five_clusters.png index c1b5c622..5247e129 100644 Binary files a/tests/baseline_images/test_clustering/silhouette_plot_five_clusters.png and b/tests/baseline_images/test_clustering/silhouette_plot_five_clusters.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_plot_five_clusters_minibatchkmeans.png b/tests/baseline_images/test_clustering/silhouette_plot_five_clusters_minibatchkmeans.png index 01725e5b..73e7a460 100644 Binary files a/tests/baseline_images/test_clustering/silhouette_plot_five_clusters_minibatchkmeans.png and b/tests/baseline_images/test_clustering/silhouette_plot_five_clusters_minibatchkmeans.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_plot_four_clusters.png b/tests/baseline_images/test_clustering/silhouette_plot_four_clusters.png index 7a14a9a2..dab8d8f8 100644 Binary files a/tests/baseline_images/test_clustering/silhouette_plot_four_clusters.png and b/tests/baseline_images/test_clustering/silhouette_plot_four_clusters.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_plot_four_clusters_minibatchkmeans.png b/tests/baseline_images/test_clustering/silhouette_plot_four_clusters_minibatchkmeans.png index 1b61155c..37c0762c 100644 Binary files a/tests/baseline_images/test_clustering/silhouette_plot_four_clusters_minibatchkmeans.png and b/tests/baseline_images/test_clustering/silhouette_plot_four_clusters_minibatchkmeans.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_plot_six_clusters.png b/tests/baseline_images/test_clustering/silhouette_plot_six_clusters.png index b3b9e197..cfb3c331 100644 Binary files a/tests/baseline_images/test_clustering/silhouette_plot_six_clusters.png and b/tests/baseline_images/test_clustering/silhouette_plot_six_clusters.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_plot_spectral.png b/tests/baseline_images/test_clustering/silhouette_plot_spectral.png index 06ff3be5..df8e6780 100644 Binary files a/tests/baseline_images/test_clustering/silhouette_plot_spectral.png and b/tests/baseline_images/test_clustering/silhouette_plot_spectral.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_plot_three_clusters.png b/tests/baseline_images/test_clustering/silhouette_plot_three_clusters.png index 24d4c38c..9a9e95ad 100644 Binary files a/tests/baseline_images/test_clustering/silhouette_plot_three_clusters.png and b/tests/baseline_images/test_clustering/silhouette_plot_three_clusters.png differ diff --git a/tests/baseline_images/test_clustering/silhouette_plot_two_clusters.png b/tests/baseline_images/test_clustering/silhouette_plot_two_clusters.png index 2bba1cac..80d9aa53 100644 Binary files a/tests/baseline_images/test_clustering/silhouette_plot_two_clusters.png and b/tests/baseline_images/test_clustering/silhouette_plot_two_clusters.png differ diff --git a/tests/test_clustering.py b/tests/test_clustering.py index d074f2fa..31a4f67e 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -188,10 +188,7 @@ def test_n_jobs(): ) def test_plot_silhouette(): clf = KMeans(random_state=10) - # original = plt.rcParams["figure.figsize"] - plt.rcParams["figure.figsize"] = (18, 7) plot.silhouette_analysis(X, clf) - # plt.rcParams["figure.figsize"] = original @image_comparison( @@ -201,7 +198,6 @@ def test_plot_silhouette(): ) def test_plot_silhouette_with_cluster_range(): clf = KMeans(random_state=10) - plt.rcParams["figure.figsize"] = (18, 7) plot.silhouette_analysis(X, clf, range_n_clusters=[4, 5]) @@ -215,7 +211,6 @@ def test_plot_silhouette_with_cluster_range(): ) def test_plot_silhouette_with_minibatchkmeans(): clf = MiniBatchKMeans(random_state=10) - plt.rcParams["figure.figsize"] = (18, 7) plot.silhouette_analysis(X, clf, range_n_clusters=[4, 5]) @@ -224,7 +219,6 @@ def test_plot_silhouette_with_minibatchkmeans(): ) def test_cmap(): clf = KMeans(random_state=10) - plt.rcParams["figure.figsize"] = (18, 7) plot.silhouette_analysis(X, clf, range_n_clusters=[2], cmap="Spectral") @@ -233,53 +227,32 @@ def test_cmap(): ) def test_metric(): clf = KMeans(random_state=10) - plt.rcParams["figure.figsize"] = (18, 7) plot.silhouette_analysis(X, clf, range_n_clusters=[6], metric="cosine") -def test_string_classes(): - clf = KMeans() - cluster_labels = clf.fit_predict(X) - plot.silhouette_analysis_from_results(X, convert_labels_into_string(cluster_labels)) - - -@image_comparison( - baseline_images=["silhouette_plot_array_like"], - extensions=["png"], - remove_text=False, -) def test_array_like(): - plot.silhouette_analysis_from_results(X.tolist(), y.tolist()) - - -@image_comparison( - baseline_images=["silhouette_plot_array_like_string_label"], - extensions=["png"], - remove_text=False, -) -def test_array_like_string(): - plot.silhouette_analysis_from_results(X.tolist(), convert_labels_into_string(y)) + clf = KMeans() + plot.silhouette_analysis(X.tolist(), clf) def test_ax_silhouette(): clf = KMeans() - cluster_labels = clf.fit_predict(X) - plot.silhouette_analysis_from_results(X, cluster_labels) - fig, ax = plt.subplots(1, 1) - out_ax = plot.silhouette_analysis_from_results(X, cluster_labels) - assert ax is not out_ax - out_ax = plot.silhouette_analysis_from_results(X, cluster_labels, ax=ax) - assert ax is out_ax + ax = [] + for i in range(5): + fig, axes = plt.subplots(1, 1) + ax.append(axes) + out_ax = plot.silhouette_analysis(X, clf) + for axes in ax: + assert axes is not out_ax + out_ax = plot.silhouette_analysis(X, clf, ax=ax) + assert ax[-1] is out_ax def test_ax_params(): - clf = KMeans(n_clusters=8) - cluster_labels = clf.fit_predict(X) - out_ax = plot.silhouette_analysis_from_results( - X, cluster_labels, text_fontsize="large" - ) - assert out_ax.get_title() == "Silhouette Analysis (n_clusters=8)" - assert out_ax.get_ylim() == (0.0, 250.0) + clf = KMeans(n_clusters=6) + out_ax = plot.silhouette_analysis(X, clf, text_fontsize="large") + assert out_ax.get_title() == "Silhouette Analysis (n_clusters=6)" + assert out_ax.get_ylim() == (0.0, 230.0) def test_invalid_clusterer(): @@ -288,18 +261,72 @@ def test_invalid_clusterer(): plot.silhouette_analysis(X, clf) -def test_silhouette_analysis_from_results_value_error(ploomber_value_error_message): +def test_silhouette_analysis_value_error(ploomber_value_error_message): + clf = KMeans() with pytest.raises(ValueError, match=ploomber_value_error_message) as e: - plot.silhouette_analysis_from_results([], y.tolist()) + plot.silhouette_analysis([], clf) assert "Expected 2D array, got 1D array" in str(e.value) -def test_from_results_call(monkeypatch): +def test_from_one_model_call(monkeypatch): mock = Mock() - monkeypatch.setattr(cl, "silhouette_analysis_from_results", mock) + monkeypatch.setattr(cl, "_silhouette_analysis_one_model", mock) clf = KMeans() - fig, ax = plt.subplots(1, 1) - ax = plot.silhouette_analysis(X, clf, range_n_clusters=[2, 3], ax=ax) + fig, ax1 = plt.subplots(1, 1) + fig, ax2 = plt.subplots(1, 1) + plot.silhouette_analysis(X, clf, range_n_clusters=[2, 3], ax=[ax1, ax2]) assert mock.call_count == 2 - assert mock.return_value == ax + + +@image_comparison( + baseline_images=[ + "silhouette_analysis_from_results_cluster_five", + "silhouette_analysis_from_results_cluster_six", + ], + extensions=["png"], + remove_text=False, +) +def test_silhouette_plot_from_results(): + cluster_labels = [ + KMeans(random_state=10, n_clusters=5).fit_predict(X), + KMeans(random_state=10, n_clusters=6).fit_predict(X), + ] + + plot.silhouette_analysis_from_results(X, cluster_labels) + + +def test_silhouette_analysis_from_results_value_error(ploomber_value_error_message): + with pytest.raises(ValueError, match=ploomber_value_error_message) as e: + plot.silhouette_analysis_from_results([], [y.tolist()]) + + assert "Expected 2D array, got 1D array" in str(e.value) + + +def test_ax_silhouette_from_results(): + X = [[1.2, 3.4], [2.2, 4.1], [1.1, 6.5]] + cluster_labels = [[0, 0, 1], [0, 1, 0]] + fig, ax1 = plt.subplots(1, 1) + fig, ax2 = plt.subplots(1, 1) + out_ax = plot.silhouette_analysis_from_results(X, cluster_labels) + assert ax1 is not out_ax + assert ax2 is not out_ax + out_ax = plot.silhouette_analysis_from_results(X, cluster_labels, ax=[ax1, ax2]) + assert ax2 is out_ax + + +def test_one_model_from_results_call(monkeypatch): + X = [[1.2, 3.4], [2.2, 4.1], [1.1, 6.5]] + cluster_labels = [[0, 0, 1], [0, 1, 0], [0, 0, 0]] + mock = Mock() + monkeypatch.setattr(cl, "_silhouette_analysis_one_model", mock) + plot.silhouette_analysis_from_results(X, cluster_labels) + assert mock.call_count == 3 + + +def test_ax_length_mismatch(ploomber_value_error_message): + X = [[1.2, 3.4], [2.2, 4.1], [1.1, 6.5]] + cluster_labels = [[0, 0, 1], [0, 1, 0]] + fig, ax = plt.subplots(1, 1) + with pytest.raises(ValueError, match=ploomber_value_error_message): + plot.silhouette_analysis_from_results(X, cluster_labels, ax=[ax])