Skip to content

Commit 84fcc41

Browse files
committed
fixes #426, adding docs for
group_by
1 parent ade098a commit 84fcc41

9 files changed

+216
-38
lines changed

README.md

+7-1
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,15 @@ Tested on Python 3.7+ and PyPy3.
2323

2424
Please check the [ChangeLog](CHANGELOG.md) file for the detailed information.
2525

26+
DeepDiff 6-6-1
27+
- Fix for [DeepDiff raises decimal exception when using significant digits](https://github.com/seperman/deepdiff/issues/426)
28+
- Introducing group_by_sort_key
29+
- Adding group_by 2D. For example `group_by=['last_name', 'zip_code']`
30+
31+
2632
DeepDiff 6-6-0
2733

28-
- [Serialize To Flat Dicts]()
34+
- [Serialize To Flat Dicts](https://zepworks.com/deepdiff/current/serialization.html#delta-to-flat-dicts-label)
2935
- [NumPy 2.0 compatibility](https://github.com/seperman/deepdiff/pull/422) by [William Jamieson](https://github.com/WilliamJamieson)
3036

3137
DeepDiff 6-5-0

deepdiff/diff.py

+43-14
Original file line numberDiff line numberDiff line change
@@ -1601,35 +1601,64 @@ def _get_view_results(self, view):
16011601
raise ValueError(INVALID_VIEW_MSG.format(view))
16021602
return result
16031603

1604+
@staticmethod
1605+
def _get_key_for_group_by(row, group_by, item_name):
1606+
try:
1607+
return row.pop(group_by)
1608+
except KeyError:
1609+
logger.error("Unable to group {} by {}. The key is missing in {}".format(item_name, group_by, row))
1610+
raise
1611+
16041612
def _group_iterable_to_dict(self, item, group_by, item_name):
16051613
"""
16061614
Convert a list of dictionaries into a dictionary of dictionaries
16071615
where the key is the value of the group_by key in each dictionary.
16081616
"""
1617+
group_by_level2 = None
1618+
if isinstance(group_by, (list, tuple)):
1619+
group_by_level1 = group_by[0]
1620+
if len(group_by) > 1:
1621+
group_by_level2 = group_by[1]
1622+
else:
1623+
group_by_level1 = group_by
16091624
if isinstance(item, Iterable) and not isinstance(item, Mapping):
16101625
result = {}
16111626
item_copy = deepcopy(item)
16121627
for row in item_copy:
16131628
if isinstance(row, Mapping):
1614-
try:
1615-
key = row.pop(group_by)
1616-
except KeyError:
1617-
logger.error("Unable to group {} by {}. The key is missing in {}".format(item_name, group_by, row))
1618-
raise
1619-
if self.group_by_sort_key:
1620-
if key not in result:
1621-
result[key] = []
1622-
if row not in result[key]:
1623-
result[key].append(row)
1629+
key1 = self._get_key_for_group_by(row, group_by_level1, item_name)
1630+
if group_by_level2:
1631+
key2 = self._get_key_for_group_by(row, group_by_level2, item_name)
1632+
if key1 not in result:
1633+
result[key1] = {}
1634+
if self.group_by_sort_key:
1635+
if key2 not in result[key1]:
1636+
result[key1][key2] = []
1637+
result_key1_key2 = result[key1][key2]
1638+
if row not in result_key1_key2:
1639+
result_key1_key2.append(row)
1640+
else:
1641+
result[key1][key2] = row
16241642
else:
1625-
result[key] = row
1643+
if self.group_by_sort_key:
1644+
if key1 not in result:
1645+
result[key1] = []
1646+
if row not in result[key1]:
1647+
result[key1].append(row)
1648+
else:
1649+
result[key1] = row
16261650
else:
1627-
msg = "Unable to group {} by {} since the item {} is not a dictionary.".format(item_name, group_by, row)
1651+
msg = "Unable to group {} by {} since the item {} is not a dictionary.".format(item_name, group_by_level1, row)
16281652
logger.error(msg)
16291653
raise ValueError(msg)
16301654
if self.group_by_sort_key:
1631-
for key, row in result.items():
1632-
row.sort(key=self.group_by_sort_key)
1655+
if group_by_level2:
1656+
for key1, row1 in result.items():
1657+
for key2, row in row1.items():
1658+
row.sort(key=self.group_by_sort_key)
1659+
else:
1660+
for key, row in result.items():
1661+
row.sort(key=self.group_by_sort_key)
16331662
return result
16341663
msg = "Unable to group {} by {}".format(item_name, group_by)
16351664
logger.error(msg)

deepdiff/helper.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import string
99
import time
1010
from ast import literal_eval
11-
from decimal import Decimal, localcontext
11+
from decimal import Decimal, localcontext, InvalidOperation as InvalidDecimalOperation
1212
from collections import namedtuple
1313
from itertools import repeat
1414
from ordered_set import OrderedSet
@@ -394,7 +394,13 @@ def number_to_string(number, significant_digits, number_format_notation="f"):
394394
# Precision = number of integer digits + significant_digits
395395
# Using number//1 to get the integer part of the number
396396
ctx.prec = len(str(abs(number // 1))) + significant_digits
397-
number = number.quantize(Decimal('0.' + '0' * significant_digits))
397+
try:
398+
number = number.quantize(Decimal('0.' + '0' * significant_digits))
399+
except InvalidDecimalOperation:
400+
# Sometimes rounding up causes a higher precision to be needed for the quantize operation
401+
# For example '999.99999999' will become '1000.000000' after quantize
402+
ctx.prec += 1
403+
number = number.quantize(Decimal('0.' + '0' * significant_digits))
398404
elif isinstance(number, only_complex_number):
399405
# Case for complex numbers.
400406
number = number.__class__(

deepdiff/serialization.py

+1
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,7 @@ def _serialize_decimal(value):
537537
JSON_CONVERTOR = {
538538
decimal.Decimal: _serialize_decimal,
539539
ordered_set.OrderedSet: list,
540+
set: list,
540541
type: lambda x: x.__name__,
541542
bytes: lambda x: x.decode('utf-8'),
542543
datetime.datetime: lambda x: x.isoformat(),

docs/basics.rst

+87-2
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,24 @@ Object attribute added:
148148
Group By
149149
--------
150150

151-
group_by can be used when dealing with list of dictionaries to convert them to group them by value defined in group_by. The common use case is when reading data from a flat CSV and primary key is one of the columns in the CSV. We want to use the primary key to group the rows instead of CSV row number.
151+
group_by can be used when dealing with a list of dictionaries. It converts the list into a single dictionary whose keys are the values found under the group_by key in each row. The common use case is when reading data from a flat CSV, and the primary key is one of the columns in the CSV. We want to use the primary key instead of the CSV row number to group the rows. group_by also supports two-level (2D) grouping when it is given a list of 2 keys.
152152

153-
Example:
153+
For example:
154+
>>> [
155+
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
156+
... {'id': 'BB', 'name': 'James', 'last_name': 'Blue'},
157+
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
158+
... ]
159+
160+
Becomes:
161+
>>> t1 = {
162+
... 'AA': {'name': 'Joe', 'last_name': 'Nobody'},
163+
... 'BB': {'name': 'James', 'last_name': 'Blue'},
164+
... 'CC': {'name': 'Mike', 'last_name': 'Apple'},
165+
... }
166+
167+
168+
With that in mind, let's take a look at the following:
154169
>>> from deepdiff import DeepDiff
155170
>>> t1 = [
156171
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
@@ -187,5 +202,75 @@ Now we use group_by='id':
187202
>>> diff['values_changed'][0].up.up.t1
188203
{'AA': {'name': 'Joe', 'last_name': 'Nobody'}, 'BB': {'name': 'James', 'last_name': 'Blue'}, 'CC': {'name': 'Mike', 'last_name': 'Apple'}}
189204

205+
2D Example:
206+
>>> from pprint import pprint
207+
>>> from deepdiff import DeepDiff
208+
>>>
209+
>>> t1 = [
210+
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
211+
... {'id': 'BB', 'name': 'James', 'last_name': 'Blue'},
212+
... {'id': 'BB', 'name': 'Jimmy', 'last_name': 'Red'},
213+
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
214+
... ]
215+
>>>
216+
>>> t2 = [
217+
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
218+
... {'id': 'BB', 'name': 'James', 'last_name': 'Brown'},
219+
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
220+
... ]
221+
>>>
222+
>>> diff = DeepDiff(t1, t2, group_by=['id', 'name'])
223+
>>> pprint(diff)
224+
{'dictionary_item_removed': [root['BB']['Jimmy']],
225+
'values_changed': {"root['BB']['James']['last_name']": {'new_value': 'Brown',
226+
'old_value': 'Blue'}}}
227+
228+
.. _group_by_sort_key_label:
229+
230+
Group By - Sort Key
231+
-------------------
232+
233+
group_by_sort_key is used to define how dictionaries are sorted if multiple ones fall under one group. When this parameter is used, group_by converts the list of dictionaries into a dictionary that maps each key to a list of dictionaries. Then, group_by_sort_key is used to sort the dictionaries within each of those lists.
234+
235+
For example, suppose there are duplicate id values. If we only use group_by='id', one of the dictionaries with id of 'BB' will overwrite the other. However, if we also set group_by_sort_key='name', we keep both dictionaries with the id of 'BB'.
236+
237+
Example:
238+
239+
[{'id': 'AA', 'int_id': 2, 'last_name': 'Nobody', 'name': 'Joe'},
240+
{'id': 'BB', 'int_id': 20, 'last_name': 'Blue', 'name': 'James'},
241+
{'id': 'BB', 'int_id': 3, 'last_name': 'Red', 'name': 'Jimmy'},
242+
{'id': 'CC', 'int_id': 4, 'last_name': 'Apple', 'name': 'Mike'}]
243+
244+
245+
Becomes:
246+
{'AA': [{'int_id': 2, 'last_name': 'Nobody', 'name': 'Joe'}],
247+
'BB': [{'int_id': 20, 'last_name': 'Blue', 'name': 'James'},
248+
{'int_id': 3, 'last_name': 'Red', 'name': 'Jimmy'}],
249+
'CC': [{'int_id': 4, 'last_name': 'Apple', 'name': 'Mike'}]}
250+
251+
252+
Example of using group_by_sort_key
253+
>>> t1 = [
254+
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
255+
... {'id': 'BB', 'name': 'James', 'last_name': 'Blue', 'int_id': 20},
256+
... {'id': 'BB', 'name': 'Jimmy', 'last_name': 'Red', 'int_id': 3},
257+
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
258+
... ]
259+
>>>
260+
>>> t2 = [
261+
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
262+
... {'id': 'BB', 'name': 'James', 'last_name': 'Brown', 'int_id': 20},
263+
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
264+
... ]
265+
>>>
266+
>>> diff = DeepDiff(t1, t2, group_by='id', group_by_sort_key='name')
267+
>>>
268+
>>> pprint(diff)
269+
{'iterable_item_removed': {"root['BB'][1]": {'int_id': 3,
270+
'last_name': 'Red',
271+
'name': 'Jimmy'}},
272+
'values_changed': {"root['BB'][0]['last_name']": {'new_value': 'Brown',
273+
'old_value': 'Blue'}}}
274+
190275

191276
Back to :doc:`/index`

docs/diff_doc.rst

+5-2
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,11 @@ include_obj_callback_strict: function, default = None
7979
get_deep_distance: Boolean, default = False
8080
:ref:`get_deep_distance_label` will get you the deep distance between objects. The distance is a number between 0 and 1 where zero means there is no diff between the 2 objects and 1 means they are very different. Note that this number should only be used to compare the similarity of 2 objects and nothing more. The algorithm for calculating this number may or may not change in the future releases of DeepDiff.
8181

82-
group_by: String, default=None
83-
:ref:`group_by_label` can be used when dealing with list of dictionaries to convert them to group them by value defined in group_by. The common use case is when reading data from a flat CSV and primary key is one of the columns in the CSV. We want to use the primary key to group the rows instead of CSV row number.
82+
group_by: String or a list of size 2, default=None
83+
:ref:`group_by_label` can be used when dealing with a list of dictionaries. It converts the list into a single dictionary whose keys are the values found under the group_by key in each row. The common use case is when reading data from a flat CSV, and the primary key is one of the columns in the CSV. We want to use the primary key instead of the CSV row number to group the rows. group_by also supports two-level (2D) grouping when it is given a list of 2 keys.
84+
85+
group_by_sort_key: String or a function
86+
:ref:`group_by_sort_key_label` is used to define how dictionaries are sorted if multiple ones fall under one group. When this parameter is used, group_by converts the list of dictionaries into a dictionary that maps each key to a list of dictionaries. Then, :ref:`group_by_sort_key_label` is used to sort the dictionaries within each of those lists.
8487

8588
hasher: default = DeepHash.sha256hex
8689
Hash function to be used. If you don't want SHA256, you can use your own hash function

docs/index.rst

+9
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,15 @@ The DeepDiff library includes the following modules:
3131
What Is New
3232
***********
3333

34+
DeepDiff 6-6-1
35+
--------------
36+
37+
- Fix for `DeepDiff raises decimal exception when using significant
38+
digits <https://github.com/seperman/deepdiff/issues/426>`__
39+
- Introducing group_by_sort_key
40+
- Adding group_by 2D. For example
41+
``group_by=['last_name', 'zip_code']``
42+
3443
DeepDiff 6-6-0
3544
--------------
3645

tests/test_diff_text.py

+47-11
Original file line numberDiff line numberDiff line change
@@ -1144,18 +1144,19 @@ def test_int_to_unicode(self):
11441144
}
11451145
assert result == ddiff
11461146

1147-
@pytest.mark.parametrize("t1, t2, ignore_numeric_type_changes, significant_digits, number_format_notation, result", [
1148-
(43.265798602382986, 43.71677762295505, False, 0, "f", {'values_changed': {'root': {'new_value': 43.71677762295505, 'old_value': 43.265798602382986}}}), # Note that it rounds the number so one becomes 43 and the other one is 44
1149-
(Decimal('2.5'), Decimal('1.5'), False, 0, "f", {}),
1150-
(Decimal('2.5'), Decimal('1.5'), False, 1, "f", {'values_changed': {'root': {'new_value': Decimal('1.5'), 'old_value': Decimal('2.5')}}}),
1151-
(Decimal('2.5'), Decimal(2.5), False, 3, "f", {}),
1152-
(1024, 1022, False, 2, "e", {}),
1153-
({"key": [Decimal('2.0001'), Decimal('20000.0001')]}, {"key": [2.0002, 20000.0002]}, True, 4, "e", {'values_changed': {"root['key'][0]": {'new_value': 2.0002, 'old_value': Decimal('2.0001')}}})
1147+
@pytest.mark.parametrize("test_num, t1, t2, ignore_numeric_type_changes, significant_digits, number_format_notation, result", [
1148+
(1, 43.265798602382986, 43.71677762295505, False, 0, "f", {'values_changed': {'root': {'new_value': 43.71677762295505, 'old_value': 43.265798602382986}}}), # Note that it rounds the number so one becomes 43 and the other one is 44
1149+
(2, Decimal('2.5'), Decimal('1.5'), False, 0, "f", {}),
1150+
(3, Decimal('2.5'), Decimal('1.5'), False, 1, "f", {'values_changed': {'root': {'new_value': Decimal('1.5'), 'old_value': Decimal('2.5')}}}),
1151+
(4, Decimal('2.5'), Decimal(2.5), False, 3, "f", {}),
1152+
(5, 1024, 1022, False, 2, "e", {}),
1153+
(6, {"key": [Decimal('2.0001'), Decimal('20000.0001')]}, {"key": [2.0002, 20000.0002]}, True, 4, "e", {'values_changed': {"root['key'][0]": {'new_value': 2.0002, 'old_value': Decimal('2.0001')}}}),
1154+
(7, [Decimal("999.99999999")], [Decimal("999.9999999999")], False, 6, "f", {}),
11541155
])
1155-
def test_significant_digits_and_notation(self, t1, t2, ignore_numeric_type_changes, significant_digits, number_format_notation, result):
1156+
def test_significant_digits_and_notation(self, test_num, t1, t2, ignore_numeric_type_changes, significant_digits, number_format_notation, result):
11561157
ddiff = DeepDiff(t1, t2, significant_digits=significant_digits, number_format_notation=number_format_notation,
11571158
ignore_numeric_type_changes=ignore_numeric_type_changes)
1158-
assert result == ddiff
1159+
assert result == ddiff, f"test_significant_digits_and_notation #{test_num} failed."
11591160

11601161
def test_significant_digits_for_complex_imaginary_part(self):
11611162
t1 = 1.23 + 1.222254j
@@ -1745,8 +1746,43 @@ def test_group_by2_when_repeats(self):
17451746
diff2 = DeepDiff(t1, t2, group_by='id', group_by_sort_key=lambda x: x['name'])
17461747
assert expected_grouped == diff2
17471748

1748-
diff3 = DeepDiff(t1, t2, group_by='id', group_by_sort_key=lambda x: x['name'])
1749-
assert expected_grouped == diff3
1749+
def test_group_by3_when_repeats_and_group_by_list(self):
1750+
t1 = [
1751+
{'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
1752+
{'id': 'BB', 'name': 'James', 'last_name': 'Blue', 'int_id': 20},
1753+
{'id': 'BB', 'name': 'Jimmy', 'last_name': 'Red', 'int_id': 3},
1754+
{'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
1755+
]
1756+
1757+
t2 = [
1758+
{'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
1759+
{'id': 'BB', 'name': 'James', 'last_name': 'Brown', 'int_id': 20},
1760+
{'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
1761+
]
1762+
1763+
diff1 = DeepDiff(t1, t2, group_by=['id', 'name'])
1764+
expected_grouped = {
1765+
'dictionary_item_removed': ["root['BB']['Jimmy']"],
1766+
'values_changed': {
1767+
"root['BB']['James']['last_name']": {
1768+
'new_value': 'Brown',
1769+
'old_value': 'Blue'
1770+
}
1771+
}
1772+
}
1773+
assert expected_grouped == diff1
1774+
1775+
diff2 = DeepDiff(t1, t2, group_by=['id', 'name'], group_by_sort_key='int_id')
1776+
expected_grouped = {
1777+
'dictionary_item_removed': ["root['BB']['Jimmy']"],
1778+
'values_changed': {
1779+
"root['BB']['James'][0]['last_name']": {
1780+
'new_value': 'Brown',
1781+
'old_value': 'Blue'
1782+
}
1783+
}
1784+
}
1785+
assert expected_grouped == diff2
17501786

17511787
def test_group_by_key_missing(self):
17521788
t1 = [

tests/test_serialization.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -315,13 +315,16 @@ def test_pretty_form_method(self, expected, verbose_level):
315315
result = ddiff.pretty()
316316
assert result == expected
317317

318-
@pytest.mark.parametrize('test_num, value', [
319-
(1, {'10': None}),
320-
(2, {"type_changes": {"root": {"old_type": None, "new_type": list, "new_value": ["你好", 2, 3, 5]}}}),
321-
(3, {'10': Decimal(2017)}),
322-
(4, Decimal(2017.1)),
318+
@pytest.mark.parametrize('test_num, value, func_to_convert_back', [
319+
(1, {'10': None}, None),
320+
(2, {"type_changes": {"root": {"old_type": None, "new_type": list, "new_value": ["你好", 2, 3, 5]}}}, None),
321+
(3, {'10': Decimal(2017)}, None),
322+
(4, Decimal(2017.1), None),
323+
(5, {1, 2, 10}, set),
323324
])
324-
def test_json_dumps_and_loads(self, test_num, value):
325+
def test_json_dumps_and_loads(self, test_num, value, func_to_convert_back):
325326
serialized = json_dumps(value)
326327
back = json_loads(serialized)
328+
if func_to_convert_back:
329+
back = func_to_convert_back(back)
327330
assert value == back, f"test_json_dumps_and_loads test #{test_num} failed"

0 commit comments

Comments
 (0)