|
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer, MaxAbsScaler, PowerTransformer
| 4 | + |
def calculate_scaled_cosine_similarity(data, scale_method='minmax'):
    """
    Calculate the scaled cosine similarity matrix for the given data.

    The data is first rescaled with the chosen method, then the cosine
    similarity between every pair of rows is computed.

    Parameters:
    - data: The input data, one row per item to compare. It must expose
      `.columns` (e.g. a pandas DataFrame), which are reused to label the
      scaled values.
    - scale_method: The method to scale the data (case-insensitive). One of
      'minmax', 'standard', 'robust', 'l2', 'l1', 'maxabs' or 'yeojohnson'.
      Default is 'minmax'.

    Returns:
    - cosine_similarity_df: A square DataFrame of row-to-row cosine
      similarities with the diagonal set to 0, plus a 'max_index' column
      holding, for each row, the label of the most similar *other* row.
      Rows and columns are labelled positionally (0..n-1); the index of
      `data` is not carried over.

    Raises:
    - ValueError: If the specified scale method is not recognized.
    """
    # Factories rather than instances: only the requested scaler is built,
    # and an unknown method is rejected before any estimator is created.
    scaler_factories = {
        'minmax': lambda: MinMaxScaler(),
        'standard': lambda: StandardScaler(),
        'robust': lambda: RobustScaler(),
        'l2': lambda: Normalizer(norm='l2'),
        'l1': lambda: Normalizer(norm='l1'),
        'maxabs': lambda: MaxAbsScaler(),
        'yeojohnson': lambda: PowerTransformer(method='yeo-johnson'),
    }

    make_scaler = scaler_factories.get(scale_method.lower())
    if make_scaler is None:
        raise ValueError(f"Scaling method '{scale_method}' is not recognized. "
                         "Choose from 'minmax', 'standard', 'robust', 'l2', 'l1', 'maxabs', or 'yeojohnson'.")

    normalized_data = make_scaler().fit_transform(data)
    normalized_df = pd.DataFrame(normalized_data, columns=data.columns)

    cosine_similarity_matrix = cosine_similarity(normalized_df)

    # A row is always perfectly similar to itself; zero the diagonal so the
    # returned matrix does not report that trivial self-match.
    np.fill_diagonal(cosine_similarity_matrix, 0)

    cosine_similarity_df = pd.DataFrame(cosine_similarity_matrix,
                                        index=normalized_df.index,
                                        columns=normalized_df.index)

    # Pick the best match on a copy whose diagonal is -inf. With a 0 diagonal,
    # a row whose similarities to all other rows are negative (possible after
    # centring scalers such as 'standard') would select itself.
    masked_matrix = cosine_similarity_matrix.copy()
    np.fill_diagonal(masked_matrix, -np.inf)
    best_positions = masked_matrix.argmax(axis=1)
    cosine_similarity_df['max_index'] = normalized_df.index.to_numpy()[best_positions]

    return cosine_similarity_df
| 54 | + |
0 commit comments