Skip to content

Commit ff6ff87

Browse files
committed
adding use_log_scale and log_scale_similarity_threshold
1 parent a739a50 commit ff6ff87

8 files changed

+138
-34
lines changed

deepdiff/diff.py

+12-8
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
np, get_truncate_datetime, dict_, CannotCompare, ENUM_INCLUDE_KEYS,
2828
PydanticBaseModel, Opcode, SetOrdered)
2929
from deepdiff.serialization import SerializationMixin
30-
from deepdiff.distance import DistanceMixin
30+
from deepdiff.distance import DistanceMixin, logarithmic_similarity
3131
from deepdiff.model import (
3232
RemapDict, ResultDict, TextResult, TreeResult, DiffLevel,
3333
DictRelationship, AttributeRelationship, REPORT_KEYS,
@@ -157,7 +157,9 @@ def __init__(self,
157157
progress_logger: Callable=logger.info,
158158
report_repetition: bool=False,
159159
significant_digits: Optional[int]=None,
160-
threshold_to_diff_deeper: float = 0,
160+
use_log_scale: bool=False,
161+
log_scale_similarity_threshold: float=0.1,
162+
threshold_to_diff_deeper: float = 0.33,
161163
truncate_datetime: Optional[str]=None,
162164
use_enum_value: bool=False,
163165
verbose_level: int=1,
@@ -178,7 +180,7 @@ def __init__(self,
178180
"cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
179181
"cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, "
180182
"math_epsilon, iterable_compare_func, use_enum_value, _original_type, threshold_to_diff_deeper, "
181-
"ignore_order_func, custom_operators, encodings, ignore_encoding_errors, "
183+
"ignore_order_func, custom_operators, encodings, ignore_encoding_errors, use_log_scale, log_scale_similarity_threshold, "
182184
"_parameters and _shared_parameters.") % ', '.join(kwargs.keys()))
183185

184186
if _parameters:
@@ -196,6 +198,8 @@ def __init__(self,
196198
if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
197199
ignore_string_type_changes = True
198200
self.use_enum_value = use_enum_value
201+
self.log_scale_similarity_threshold = log_scale_similarity_threshold
202+
self.use_log_scale = use_log_scale
199203
self.threshold_to_diff_deeper = threshold_to_diff_deeper
200204
self.ignore_string_type_changes = ignore_string_type_changes
201205
self.ignore_type_in_groups = self.get_ignore_types_in_groups(
@@ -583,9 +587,8 @@ def _diff_dict(
583587
t_keys_union = t2_keys | t1_keys
584588
t_keys_added = t2_keys - t_keys_intersect
585589
t_keys_removed = t1_keys - t_keys_intersect
586-
587590
if self.threshold_to_diff_deeper:
588-
if len(t_keys_union) and len(t_keys_intersect) / len(t_keys_union) < self.threshold_to_diff_deeper:
591+
if len(t_keys_union) > 1 and len(t_keys_intersect) / len(t_keys_union) < self.threshold_to_diff_deeper:
589592
self._report_result('values_changed', level, local_tree=local_tree)
590593
return
591594

@@ -1145,7 +1148,6 @@ def defaultdict_orderedset():
11451148
pairs = dict_()
11461149

11471150
pre_calced_distances = None
1148-
11491151
if hashes_added and hashes_removed and np and len(hashes_added) > 1 and len(hashes_removed) > 1:
11501152
# pre-calculates distances ONLY for 1D arrays whether an _original_type
11511153
# was explicitly passed or a homogeneous array is detected.
@@ -1233,7 +1235,6 @@ def _diff_iterable_with_deephash(self, level, parents_ids, _original_type=None,
12331235
else:
12341236
t1_hashtable = {k: v for k, v in full_t1_hashtable.items() if k in hashes_removed}
12351237
t2_hashtable = {k: v for k, v in full_t2_hashtable.items() if k in hashes_added}
1236-
12371238
if self._stats[PASSES_COUNT] < self.max_passes and get_pairs:
12381239
self._stats[PASSES_COUNT] += 1
12391240
pairs = self._get_most_in_common_pairs_in_iterables(
@@ -1403,7 +1404,10 @@ def _diff_numbers(self, level, local_tree=None, report_type_change=True):
14031404
else:
14041405
t1_type = t2_type = ''
14051406

1406-
if self.math_epsilon is not None:
1407+
if self.use_log_scale:
1408+
if not logarithmic_similarity(level.t1, level.t2, threshold=self.log_scale_similarity_threshold):
1409+
self._report_result('values_changed', level, local_tree=local_tree)
1410+
elif self.math_epsilon is not None:
14071411
if not is_close(level.t1, level.t2, abs_tol=self.math_epsilon):
14081412
self._report_result('values_changed', level, local_tree=local_tree)
14091413
elif self.significant_digits is None:

deepdiff/distance.py

+63-11
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import numpy as np
2+
import math
13
import datetime
24
from deepdiff.deephash import DeepHash
35
from deepdiff.helper import (
@@ -31,7 +33,7 @@ def _get_rough_distance(self):
3133
"""
3234

3335
_distance = get_numeric_types_distance(
34-
self.t1, self.t2, max_=self.cutoff_distance_for_pairs)
36+
self.t1, self.t2, max_=self.cutoff_distance_for_pairs, use_log_scale=self.use_log_scale, log_scale_similarity_threshold=self.log_scale_similarity_threshold)
3537

3638
if _distance is not not_found:
3739
return _distance
@@ -122,7 +124,10 @@ def _precalculate_numpy_arrays_distance(
122124

123125
distances = _get_numpy_array_distance(
124126
pairs_transposed[0], pairs_transposed[1],
125-
max_=self.cutoff_distance_for_pairs)
127+
max_=self.cutoff_distance_for_pairs,
128+
use_log_scale=self.use_log_scale,
129+
log_scale_similarity_threshold=self.log_scale_similarity_threshold,
130+
)
126131

127132
i = 0
128133
for added_hash in hashes_added:
@@ -186,14 +191,19 @@ def _get_item_length(item, parents_ids=frozenset([])):
186191
return length
187192

188193

189-
def _get_numbers_distance(num1, num2, max_=1):
194+
def _get_numbers_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1):
190195
"""
191196
Get the distance of 2 numbers. The output is a number between 0 to the max.
192197
The reason is the
193198
When max is returned means the 2 numbers are really far, and 0 means they are equal.
194199
"""
195200
if num1 == num2:
196201
return 0
202+
if use_log_scale:
203+
distance = logarithmic_distance(num1, num2)
204+
if distance < log_scale_similarity_threshold:
205+
return 0
206+
return distance
197207
if not isinstance(num1, float):
198208
num1 = float(num1)
199209
if not isinstance(num2, float):
@@ -218,8 +228,42 @@ def _numpy_div(a, b, replace_inf_with=1):
218228
result[a == b] = 0
219229
return result
220230

231+
# Small positive offset added before taking a logarithm so that inputs at or
# near zero still produce a finite value.
MATH_LOG_OFFSET = 1e-10


def numpy_apply_log_keep_sign(array, offset=MATH_LOG_OFFSET):
    """
    Apply a natural logarithm element-wise and carry over each input's sign.

    Every element ``x`` is mapped to ``copysign(log(|x| + offset), x)``: the
    result has the magnitude of the logarithm and the sign of ``x``.

    NOTE(review): ``copysign`` keeps only the magnitude of the logarithm, so
    inputs whose absolute value is below 1 (negative log) end up with the same
    output as their reciprocal, e.g. 0.5 and 2. Confirm this is intended.
    """
    log_magnitudes = np.log(np.abs(array) + offset)
    return np.copysign(log_magnitudes, array)
245+
221246

222-
def _get_numpy_array_distance(num1, num2, max_=1):
247+
def logarithmic_similarity(a: numbers, b: numbers, threshold: float=0.1):
    """
    Tell whether two numbers are close to each other on a logarithmic scale.

    Returns True when ``logarithmic_distance(a, b)`` is strictly smaller than
    ``threshold``, False otherwise.

    Rough guide for choosing the threshold:
        0.05 is about a 5.1% difference.
        0.1 is about a 10.5% difference.
        0.5 is about a 65% difference.
    """
    distance = logarithmic_distance(a, b)
    return distance < threshold
254+
255+
256+
def logarithmic_distance(a: numbers, b: numbers):
    """
    Return the non-negative distance between two numbers on a log scale.

    Both inputs are converted to float. MATH_LOG_OFFSET is added to each
    absolute value before taking the logarithm so that zero is accepted.

    - When a and b are on the same side of zero (or either one is zero) the
      result is ``|log(|a| + offset) - log(|b| + offset)|``, i.e. roughly the
      absolute log of their ratio.
    - When a and b have opposite signs the result is the log-scale distance
      from a down to zero plus the distance from zero up to b, so values such
      as 1 and -1 are far apart.
    """
    a = float(a)
    b = float(b)
    log_a = math.log(abs(a) + MATH_LOG_OFFSET)
    log_b = math.log(abs(b) + MATH_LOG_OFFSET)

    # Do not fold the input sign into the log with copysign(): that drops the
    # log's own sign and makes 0.5 and 2 (log = -0.69 / +0.69) look identical.
    same_side_of_zero = a == 0 or b == 0 or (a > 0) == (b > 0)
    if same_side_of_zero:
        return abs(log_a - log_b)

    # Opposite signs: walk from a to (offset-sized) zero, then on to b.
    log_zero = math.log(MATH_LOG_OFFSET)
    return (log_a - log_zero) + (log_b - log_zero)
264+
265+
266+
def _get_numpy_array_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1):
223267
"""
224268
Get the distance of 2 numbers. The output is a number between 0 to the max.
225269
The reason is the
@@ -229,24 +273,32 @@ def _get_numpy_array_distance(num1, num2, max_=1):
229273
# getting the pairs of items during the ignore_order=True
230274
# calculations, we need to make the divisor of comparison very big
231275
# so that any 2 numbers can be chosen as pairs.
276+
if use_log_scale:
277+
num1 = numpy_apply_log_keep_sign(num1)
278+
num2 = numpy_apply_log_keep_sign(num2)
279+
232280
divisor = (num1 + num2) / max_
233281
result = _numpy_div((num1 - num2), divisor, replace_inf_with=max_)
234-
return np.clip(np.absolute(result), 0, max_)
282+
283+
distance_array = np.clip(np.absolute(result), 0, max_)
284+
if use_log_scale:
285+
distance_array[distance_array < log_scale_similarity_threshold] = 0
286+
return distance_array
235287

236288

237-
def _get_datetime_distance(date1, date2, max_):
289+
def _get_datetime_distance(date1, date2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Distance between two datetimes, computed from their ``timestamp()`` values.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted so
    this helper can be called with the same arguments as the other distance
    helpers; they are not forwarded, so the distance is always linear. They
    default to the same values as in ``_get_numbers_distance`` so that
    three-argument calls keep working.
    """
    return _get_numbers_distance(date1.timestamp(), date2.timestamp(), max_)
239291

240292

241-
def _get_date_distance(date1, date2, max_):
293+
def _get_date_distance(date1, date2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Distance between two dates, computed from their ``toordinal()`` values.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted so
    this helper can be called with the same arguments as the other distance
    helpers; they are not forwarded, so the distance is always linear. They
    default to the same values as in ``_get_numbers_distance`` so that
    three-argument calls keep working.
    """
    return _get_numbers_distance(date1.toordinal(), date2.toordinal(), max_)
243295

244296

245-
def _get_timedelta_distance(timedelta1, timedelta2, max_):
297+
def _get_timedelta_distance(timedelta1, timedelta2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Distance between two timedeltas, computed from their ``total_seconds()``.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted so
    this helper can be called with the same arguments as the other distance
    helpers; they are not forwarded, so the distance is always linear. They
    default to the same values as in ``_get_numbers_distance`` so that
    three-argument calls keep working.
    """
    return _get_numbers_distance(timedelta1.total_seconds(), timedelta2.total_seconds(), max_)
247299

248300

249-
def _get_time_distance(time1, time2, max_):
301+
def _get_time_distance(time1, time2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Distance between two time values, after converting each one with
    ``time_to_seconds``.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted so
    this helper can be called with the same arguments as the other distance
    helpers; they are not forwarded, so the distance is always linear. They
    default to the same values as in ``_get_numbers_distance`` so that
    three-argument calls keep working.
    """
    return _get_numbers_distance(time_to_seconds(time1), time_to_seconds(time2), max_)
251303

252304

@@ -259,8 +311,8 @@ def _get_time_distance(time1, time2, max_):
259311
]
260312

261313

262-
def get_numeric_types_distance(num1, num2, max_):
314+
def get_numeric_types_distance(num1, num2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Dispatch to the distance helper registered for the type of both inputs.

    Walks TYPES_TO_DIST_FUNC in order and, for the first entry whose type
    matches both ``num1`` and ``num2``, returns that helper's result. All
    arguments are passed through positionally. Returns ``not_found`` when no
    registered type matches both values.
    """
    for candidate_type, distance_func in TYPES_TO_DIST_FUNC:
        if not (isinstance(num1, candidate_type) and isinstance(num2, candidate_type)):
            continue
        return distance_func(num1, num2, max_, use_log_scale, log_scale_similarity_threshold)
    return not_found

deepdiff/helper.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import sys
22
import re
33
import os
4+
import math
45
import datetime
56
import uuid
67
import logging

tests/test_cache.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ def test_cache_deeply_nested_b(self, nested_b_t1, nested_b_t2, nested_b_result):
7474
'MAX PASS LIMIT REACHED': False,
7575
'MAX DIFF LIMIT REACHED': False
7676
}
77-
assert expected_stats == stats
77+
stats_diff = DeepDiff(expected_stats, stats, use_log_scale=True, log_scale_similarity_threshold=0.15)
78+
assert not stats_diff
7879
assert nested_b_result == diff
7980

8081
diff_of_diff = DeepDiff(nested_b_result, diff.to_dict(), ignore_order=False)

tests/test_delta.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,7 @@ def test_delta_dict_items_added_retain_order(self):
448448
}
449449
}
450450

451-
diff = DeepDiff(t1, t2)
451+
diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
452452
delta_dict = diff._to_delta_dict()
453453
assert expected_delta_dict == delta_dict
454454
delta = Delta(diff, bidirectional=False, raise_errors=True)
@@ -828,9 +828,9 @@ def compare_func(item1, item2, level=None):
828828
'delta_case14b_threshold_to_diff_deeper': {
829829
't1': picklalbe_obj_without_item,
830830
't2': PicklableClass(11),
831-
'deepdiff_kwargs': {'threshold_to_diff_deeper': 0.33},
831+
'deepdiff_kwargs': {'threshold_to_diff_deeper': 0.5},
832832
'to_delta_kwargs': {},
833-
'expected_delta_dict': {'values_changed': {'root': {'new_value': PicklableClass(11)}}}
833+
'expected_delta_dict': {'attribute_added': {'root.item': 11}}
834834
},
835835
'delta_case15_diffing_simple_numbers': {
836836
't1': 1,

tests/test_diff_text.py

+34-4
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def test_value_change(self):
104104
def test_item_added_and_removed(self):
105105
t1 = {1: 1, 2: 2, 3: [3], 4: 4}
106106
t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6}
107-
ddiff = DeepDiff(t1, t2)
107+
ddiff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
108108
result = {
109109
'dictionary_item_added': ["root[5]", "root[6]"],
110110
'dictionary_item_removed': ["root[4]"],
@@ -1023,7 +1023,7 @@ def test_dictionary_with_string_keys1(self):
10231023
t1 = {"veggie": "carrots"}
10241024
t2 = {"meat": "carrots"}
10251025

1026-
diff = DeepDiff(t1, t2)
1026+
diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
10271027
assert {'dictionary_item_added': ["root['meat']"],
10281028
'dictionary_item_removed': ["root['veggie']"]} == diff
10291029

@@ -1037,9 +1037,12 @@ def test_dictionary_with_string_keys_threshold_to_diff_deeper(self):
10371037
def test_dictionary_with_numeric_keys(self):
10381038
t1 = {Decimal('10.01'): "carrots"}
10391039
t2 = {10.01: "carrots"}
1040-
diff = DeepDiff(t1, t2)
1040+
diff = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
10411041
assert {'dictionary_item_added': ["root[10.01]"], 'dictionary_item_removed': ["root[Decimal('10.01')]"]} == diff
10421042

1043+
diff2 = DeepDiff(t1, t2)
1044+
assert {'values_changed': {'root': {'new_value': {10.01: 'carrots'}, 'old_value': {Decimal('10.01'): 'carrots'}}}} == diff2
1045+
10431046
def test_loop(self):
10441047
class LoopTest:
10451048
def __init__(self, a):
@@ -1331,6 +1334,33 @@ def test_decimal_digits(self, t1, t2, significant_digits, expected_result):
13311334
ddiff = DeepDiff(t1, t2, ignore_numeric_type_changes=True, ignore_string_type_changes=True, significant_digits=significant_digits)
13321335
assert expected_result == ddiff
13331336

1337+
@pytest.mark.parametrize('test_num, t1, t2, log_scale_similarity_threshold, expected', [
    (
        1,
        {'foo': 110, 'bar': 306},  # t1
        {'foo': 140, 'bar': 298},  # t2
        0.01,  # threshold
        {'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}, "root['bar']": {'new_value': 298, 'old_value': 306}}},  # expected
    ),
    (
        2,
        {'foo': 110, 'bar': 306},  # t1
        {'foo': 140, 'bar': 298},  # t2
        0.1,  # threshold
        {'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}}},  # expected
    ),
    (
        # Each case needs its own number so the failure message below
        # identifies which one broke.
        3,
        {'foo': 110, 'bar': 306},  # t1
        {'foo': 140, 'bar': 298},  # t2
        0.3,  # threshold
        {},  # expected
    ),
])
def test_log_scale(self, test_num, t1, t2, log_scale_similarity_threshold, expected):
    """
    With use_log_scale=True, only numeric changes whose log-scale distance
    reaches the threshold are reported: a tighter threshold reports both
    keys, a looser one reports fewer, and the loosest reports none.
    """
    diff = DeepDiff(t1, t2, use_log_scale=True, log_scale_similarity_threshold=log_scale_similarity_threshold)
    assert expected == diff, f"test_log_scale #{test_num} failed."
1363+
13341364
def test_ignore_type_in_groups(self):
13351365
t1 = [1, 2, 3]
13361366
t2 = [1.0, 2.0, 3.0]
@@ -1348,7 +1378,7 @@ def test_ignore_type_in_groups3(self):
13481378
t1 = {Decimal('10.01'): "carrots"}
13491379
t2 = {10.01: "carrots"}
13501380

1351-
diff1 = DeepDiff(t1, t2)
1381+
diff1 = DeepDiff(t1, t2, threshold_to_diff_deeper=0)
13521382

13531383
diff2 = DeepDiff(t1, t2, ignore_numeric_type_changes=True)
13541384

0 commit comments

Comments
 (0)