1
+ import numpy as np
2
+ import math
1
3
import datetime
2
4
from deepdiff .deephash import DeepHash
3
5
from deepdiff .helper import (
@@ -31,7 +33,7 @@ def _get_rough_distance(self):
31
33
"""
32
34
33
35
_distance = get_numeric_types_distance (
34
- self .t1 , self .t2 , max_ = self .cutoff_distance_for_pairs )
36
+ self .t1 , self .t2 , max_ = self .cutoff_distance_for_pairs , use_log_scale = self . use_log_scale , log_scale_similarity_threshold = self . log_scale_similarity_threshold )
35
37
36
38
if _distance is not not_found :
37
39
return _distance
@@ -122,7 +124,10 @@ def _precalculate_numpy_arrays_distance(
122
124
123
125
distances = _get_numpy_array_distance (
124
126
pairs_transposed [0 ], pairs_transposed [1 ],
125
- max_ = self .cutoff_distance_for_pairs )
127
+ max_ = self .cutoff_distance_for_pairs ,
128
+ use_log_scale = self .use_log_scale ,
129
+ log_scale_similarity_threshold = self .log_scale_similarity_threshold ,
130
+ )
126
131
127
132
i = 0
128
133
for added_hash in hashes_added :
@@ -186,14 +191,19 @@ def _get_item_length(item, parents_ids=frozenset([])):
186
191
return length
187
192
188
193
189
- def _get_numbers_distance (num1 , num2 , max_ = 1 ):
194
+ def _get_numbers_distance (num1 , num2 , max_ = 1 , use_log_scale = False , log_scale_similarity_threshold = 0.1 ):
190
195
"""
191
196
Get the distance of 2 numbers. The output is a number between 0 to the max.
192
197
The reason is the
193
198
When max is returned means the 2 numbers are really far, and 0 means they are equal.
194
199
"""
195
200
if num1 == num2 :
196
201
return 0
202
+ if use_log_scale :
203
+ distance = logarithmic_distance (num1 , num2 )
204
+ if distance < logarithmic_distance :
205
+ return 0
206
+ return distance
197
207
if not isinstance (num1 , float ):
198
208
num1 = float (num1 )
199
209
if not isinstance (num2 , float ):
@@ -218,8 +228,42 @@ def _numpy_div(a, b, replace_inf_with=1):
218
228
result [a == b ] = 0
219
229
return result
220
230
231
# Small positive constant added before taking a logarithm so that values at
# or very close to zero never reach log(0).
MATH_LOG_OFFSET = 1e-10


def numpy_apply_log_keep_sign(array, offset=MATH_LOG_OFFSET):
    """
    Take the natural log of the magnitude of every element of array and give
    each result the sign of the element it came from.

    offset is added to every absolute value before the logarithm is taken.

    Note: np.copysign only keeps the magnitude of the log value. When the
    absolute value plus offset is below 1 the log is negative, and that
    negative value is replaced by its magnitude carrying the element's sign.
    """
    log_of_magnitude = np.log(np.abs(array) + offset)
    return np.copysign(log_of_magnitude, array)
245
+
221
246
222
- def _get_numpy_array_distance (num1 , num2 , max_ = 1 ):
247
def logarithmic_similarity(a: numbers, b: numbers, threshold: float = 0.1):
    """
    Tell whether a and b are closer than threshold on the logarithmic scale.

    Returns True when logarithmic_distance(a, b) is strictly below threshold.

    A threshold of 0.1 translates to about 10.5% difference.
    A threshold of 0.5 translates to about 65% difference.
    A threshold of 0.05 translates to about 5.1% difference.
    """
    log_gap = logarithmic_distance(a, b)
    return log_gap < threshold
254
+
255
+
256
def logarithmic_distance(a: numbers, b: numbers):
    """
    Return the absolute gap between the sign-carrying natural logs of a and b.

    Both inputs are converted to float. For each one, the log of its absolute
    value plus MATH_LOG_OFFSET is taken and given the sign of the input; the
    result is the absolute difference of those two values.
    """
    def _signed_log(value):
        # The offset keeps math.log defined when value is 0.
        return math.copysign(math.log(abs(value) + MATH_LOG_OFFSET), value)

    first, second = float(a), float(b)
    return abs(_signed_log(first) - _signed_log(second))
264
+
265
+
266
+ def _get_numpy_array_distance (num1 , num2 , max_ = 1 , use_log_scale = False , log_scale_similarity_threshold = 0.1 ):
223
267
"""
224
268
Get the distance of 2 numbers. The output is a number between 0 to the max.
225
269
The reason is the
@@ -229,24 +273,32 @@ def _get_numpy_array_distance(num1, num2, max_=1):
229
273
# getting the pairs of items during the ignore_order=True
230
274
# calculations, we need to make the divisor of comparison very big
231
275
# so that any 2 numbers can be chosen as pairs.
276
+ if use_log_scale :
277
+ num1 = numpy_apply_log_keep_sign (num1 )
278
+ num2 = numpy_apply_log_keep_sign (num2 )
279
+
232
280
divisor = (num1 + num2 ) / max_
233
281
result = _numpy_div ((num1 - num2 ), divisor , replace_inf_with = max_ )
234
- return np .clip (np .absolute (result ), 0 , max_ )
282
+
283
+ distance_array = np .clip (np .absolute (result ), 0 , max_ )
284
+ if use_log_scale :
285
+ distance_array [distance_array < log_scale_similarity_threshold ] = 0
286
+ return distance_array
235
287
236
288
237
- def _get_datetime_distance (date1 , date2 , max_ ):
289
def _get_datetime_distance(date1, date2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the distance of 2 datetimes as computed by _get_numbers_distance on
    their timestamps.

    use_log_scale and log_scale_similarity_threshold are accepted so this
    function can be called with the same arguments as the other distance
    functions, but they are not forwarded: only max_ is passed on.
    They carry the same defaults as _get_numbers_distance so the 3-argument
    call form keeps working.
    """
    return _get_numbers_distance(date1.timestamp(), date2.timestamp(), max_)
239
291
240
292
241
- def _get_date_distance (date1 , date2 , max_ ):
293
def _get_date_distance(date1, date2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the distance of 2 dates as computed by _get_numbers_distance on their
    ordinals (date.toordinal()).

    use_log_scale and log_scale_similarity_threshold are accepted so this
    function can be called with the same arguments as the other distance
    functions, but they are not forwarded: only max_ is passed on.
    They carry the same defaults as _get_numbers_distance so the 3-argument
    call form keeps working.
    """
    return _get_numbers_distance(date1.toordinal(), date2.toordinal(), max_)
243
295
244
296
245
- def _get_timedelta_distance (timedelta1 , timedelta2 , max_ ):
297
def _get_timedelta_distance(timedelta1, timedelta2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the distance of 2 timedeltas as computed by _get_numbers_distance on
    their total_seconds() values.

    use_log_scale and log_scale_similarity_threshold are accepted so this
    function can be called with the same arguments as the other distance
    functions, but they are not forwarded: only max_ is passed on.
    They carry the same defaults as _get_numbers_distance so the 3-argument
    call form keeps working.
    """
    return _get_numbers_distance(timedelta1.total_seconds(), timedelta2.total_seconds(), max_)
247
299
248
300
249
- def _get_time_distance (time1 , time2 , max_ ):
301
def _get_time_distance(time1, time2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the distance of 2 time values as computed by _get_numbers_distance on
    the results of time_to_seconds for each of them.

    use_log_scale and log_scale_similarity_threshold are accepted so this
    function can be called with the same arguments as the other distance
    functions, but they are not forwarded: only max_ is passed on.
    They carry the same defaults as _get_numbers_distance so the 3-argument
    call form keeps working.
    """
    return _get_numbers_distance(time_to_seconds(time1), time_to_seconds(time2), max_)
251
303
252
304
@@ -259,8 +311,8 @@ def _get_time_distance(time1, time2, max_):
259
311
]
260
312
261
313
262
- def get_numeric_types_distance (num1 , num2 , max_ ):
314
def get_numeric_types_distance(num1, num2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Dispatch to the distance function registered for the type shared by num1
    and num2.

    TYPES_TO_DIST_FUNC is scanned in order. The first entry whose type both
    values are instances of supplies the function, which is called with
    (num1, num2, max_, use_log_scale, log_scale_similarity_threshold) and whose
    result is returned. When no entry matches, not_found is returned.
    """
    for candidate_type, distance_func in TYPES_TO_DIST_FUNC:
        both_match = isinstance(num1, candidate_type) and isinstance(num2, candidate_type)
        if not both_match:
            continue
        return distance_func(num1, num2, max_, use_log_scale, log_scale_similarity_threshold)
    return not_found
0 commit comments