Skip to content

Commit 200568e

Browse files
committed
Add calculate_scaled_cosine_similarity function to latent_experiments.py
1 parent 8f2f48f commit 200568e

File tree

2 files changed

+55
-0
lines changed

2 files changed

+55
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,4 @@ cython_debug/
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160160
#.idea/
161+
.DS_Store
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import pandas as pd
2+
from sklearn.metrics.pairwise import cosine_similarity
3+
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer, MaxAbsScaler, PowerTransformer
4+
5+
def calculate_scaled_cosine_similarity(data, scale_method='minmax'):
    """
    Calculate the pairwise cosine similarity between the rows of scaled data.

    The data is transformed with the selected scikit-learn scaler, the
    row-by-row cosine similarity matrix is computed, and each row is annotated
    with the position of its most similar *other* row.

    Parameters:
    - data: 2-D numeric table with one sample per row (for example a pandas
      DataFrame or a NumPy array); it is passed directly to the scaler's
      ``fit_transform``.
    - scale_method: Name of the scaling method (case-insensitive). One of
      'minmax', 'standard', 'robust', 'l2', 'l1', 'maxabs' or 'yeojohnson'.
      Default is 'minmax'.

    Returns:
    - cosine_similarity_df: DataFrame whose rows and columns are labelled by
      row position (0..n-1). The diagonal (self-similarity) is set to 0, and an
      extra 'max_index' column holds the position of the most similar other
      row for each row.

    Raises:
    - ValueError: If the specified scale method is not recognized.
    """
    # Imported locally: the module-level imports do not bring numpy into scope.
    import numpy as np

    # Factories rather than instances, so only the requested scaler is built
    # and an unknown method is rejected before any scaler is constructed.
    scaler_factories = {
        'minmax': lambda: MinMaxScaler(),
        'standard': lambda: StandardScaler(),
        'robust': lambda: RobustScaler(),
        'l2': lambda: Normalizer(norm='l2'),
        'l1': lambda: Normalizer(norm='l1'),
        'maxabs': lambda: MaxAbsScaler(),
        'yeojohnson': lambda: PowerTransformer(method='yeo-johnson'),
    }

    # A non-string method cannot match any key; report it as the documented
    # ValueError instead of failing on ``.lower()``.
    method_key = scale_method.lower() if isinstance(scale_method, str) else None
    make_scaler = scaler_factories.get(method_key)

    if make_scaler is None:
        raise ValueError(f"Scaling method '{scale_method}' is not recognized. "
                         "Choose from 'minmax', 'standard', 'robust', 'l2', 'l1', 'maxabs', or 'yeojohnson'.")

    normalized_data = make_scaler().fit_transform(data)

    cosine_similarity_matrix = cosine_similarity(normalized_data)

    # Find each row's best match with the diagonal masked to -inf. Masking
    # with 0 would let a row pick itself whenever all of its similarities to
    # other rows are negative.
    masked_matrix = cosine_similarity_matrix.copy()
    np.fill_diagonal(masked_matrix, -np.inf)
    max_index = pd.DataFrame(masked_matrix).idxmax(axis=1)

    # Set diagonal to 0 from 1 in the matrix that is returned
    np.fill_diagonal(cosine_similarity_matrix, 0)

    # Convert to DataFrame and add max_index column
    cosine_similarity_df = pd.DataFrame(cosine_similarity_matrix)
    cosine_similarity_df['max_index'] = max_index

    return cosine_similarity_df
54+

0 commit comments

Comments
 (0)