Skip to content

Commit bfc88d3

Browse files
[3.9] gh-91810: ElementTree: Use text file's encoding by default in XML declaration (GH-91903) (GH-92665)
The ElementTree method write() and the function tostring() now use the text file's encoding ("UTF-8" if not available) instead of the locale encoding in the XML declaration when encoding="unicode" is specified. (cherry picked from commit 707839b) Co-authored-by: Serhiy Storchaka <[email protected]> Automerge-Triggered-By: GH:serhiy-storchaka
1 parent 3f2113d commit bfc88d3

File tree

3 files changed

+29
-30
lines changed

3 files changed

+29
-30
lines changed

Lib/test/test_xml_etree.py

+15-16
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import html
1111
import io
1212
import itertools
13-
import locale
1413
import operator
1514
import os
1615
import pickle
@@ -960,15 +959,13 @@ def test_tostring_xml_declaration(self):
960959

961960
def test_tostring_xml_declaration_unicode_encoding(self):
962961
elem = ET.XML('<body><tag/></body>')
963-
preferredencoding = locale.getpreferredencoding()
964962
self.assertEqual(
965-
f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>",
966-
ET.tostring(elem, encoding='unicode', xml_declaration=True)
963+
ET.tostring(elem, encoding='unicode', xml_declaration=True),
964+
"<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
967965
)
968966

969967
def test_tostring_xml_declaration_cases(self):
970968
elem = ET.XML('<body><tag>ø</tag></body>')
971-
preferredencoding = locale.getpreferredencoding()
972969
TESTCASES = [
973970
# (expected_retval, encoding, xml_declaration)
974971
# ... xml_declaration = None
@@ -995,7 +992,7 @@ def test_tostring_xml_declaration_cases(self):
995992
b"<body><tag>&#248;</tag></body>", 'US-ASCII', True),
996993
(b"<?xml version='1.0' encoding='ISO-8859-1'?>\n"
997994
b"<body><tag>\xf8</tag></body>", 'ISO-8859-1', True),
998-
(f"<?xml version='1.0' encoding='{preferredencoding}'?>\n"
995+
("<?xml version='1.0' encoding='utf-8'?>\n"
999996
"<body><tag>ø</tag></body>", 'unicode', True),
1000997

1001998
]
@@ -1033,11 +1030,10 @@ def test_tostringlist_xml_declaration(self):
10331030
b"<?xml version='1.0' encoding='us-ascii'?>\n<body><tag /></body>"
10341031
)
10351032

1036-
preferredencoding = locale.getpreferredencoding()
10371033
stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True)
10381034
self.assertEqual(
10391035
''.join(stringlist),
1040-
f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>"
1036+
"<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
10411037
)
10421038
self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>")
10431039
self.assertEqual(['<body', '>', '<tag', ' />', '</body>'], stringlist[1:])
@@ -3681,17 +3677,16 @@ def test_write_to_filename_as_unicode(self):
36813677
encoding = f.encoding
36823678
support.unlink(TESTFN)
36833679

3684-
try:
3685-
'\xf8'.encode(encoding)
3686-
except UnicodeEncodeError:
3687-
self.skipTest(f'default file encoding {encoding} not supported')
3688-
36893680
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
36903681
tree.write(TESTFN, encoding='unicode')
36913682
with open(TESTFN, 'rb') as f:
36923683
data = f.read()
36933684
expected = "<site>\xf8</site>".encode(encoding, 'xmlcharrefreplace')
3694-
self.assertEqual(data, expected)
3685+
if encoding.lower() in ('utf-8', 'ascii'):
3686+
self.assertEqual(data, expected)
3687+
else:
3688+
self.assertIn(b"<?xml version='1.0' encoding=", data)
3689+
self.assertIn(expected, data)
36953690

36963691
def test_write_to_text_file(self):
36973692
self.addCleanup(support.unlink, TESTFN)
@@ -3706,13 +3701,17 @@ def test_write_to_text_file(self):
37063701
tree.write(f, encoding='unicode')
37073702
self.assertFalse(f.closed)
37083703
with open(TESTFN, 'rb') as f:
3709-
self.assertEqual(f.read(), b'''<site>&#248;</site>''')
3704+
self.assertEqual(f.read(), convlinesep(
3705+
b'''<?xml version='1.0' encoding='ascii'?>\n'''
3706+
b'''<site>&#248;</site>'''))
37103707

37113708
with open(TESTFN, 'w', encoding='ISO-8859-1') as f:
37123709
tree.write(f, encoding='unicode')
37133710
self.assertFalse(f.closed)
37143711
with open(TESTFN, 'rb') as f:
3715-
self.assertEqual(f.read(), b'''<site>\xf8</site>''')
3712+
self.assertEqual(f.read(), convlinesep(
3713+
b'''<?xml version='1.0' encoding='ISO-8859-1'?>\n'''
3714+
b'''<site>\xf8</site>'''))
37163715

37173716
def test_write_to_binary_file(self):
37183717
self.addCleanup(support.unlink, TESTFN)

Lib/xml/etree/ElementTree.py

+9-14
Original file line numberDiff line numberDiff line change
@@ -728,16 +728,10 @@ def write(self, file_or_filename,
728728
encoding = "utf-8"
729729
else:
730730
encoding = "us-ascii"
731-
enc_lower = encoding.lower()
732-
with _get_writer(file_or_filename, enc_lower) as write:
731+
with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
733732
if method == "xml" and (xml_declaration or
734733
(xml_declaration is None and
735-
enc_lower not in ("utf-8", "us-ascii", "unicode"))):
736-
declared_encoding = encoding
737-
if enc_lower == "unicode":
738-
# Retrieve the default encoding for the xml declaration
739-
import locale
740-
declared_encoding = locale.getpreferredencoding()
734+
declared_encoding.lower() not in ("utf-8", "us-ascii"))):
741735
write("<?xml version='1.0' encoding='%s'?>\n" % (
742736
declared_encoding,))
743737
if method == "text":
@@ -762,19 +756,20 @@ def _get_writer(file_or_filename, encoding):
762756
write = file_or_filename.write
763757
except AttributeError:
764758
# file_or_filename is a file name
765-
if encoding == "unicode":
766-
file = open(file_or_filename, "w")
759+
if encoding.lower() == "unicode":
760+
file = open(file_or_filename, "w",
761+
errors="xmlcharrefreplace")
767762
else:
768763
file = open(file_or_filename, "w", encoding=encoding,
769764
errors="xmlcharrefreplace")
770765
with file:
771-
yield file.write
766+
yield file.write, file.encoding
772767
else:
773768
# file_or_filename is a file-like object
774769
# encoding determines if it is a text or binary writer
775-
if encoding == "unicode":
770+
if encoding.lower() == "unicode":
776771
# use a text writer as is
777-
yield write
772+
yield write, getattr(file_or_filename, "encoding", None) or "utf-8"
778773
else:
779774
# wrap a binary writer with TextIOWrapper
780775
with contextlib.ExitStack() as stack:
@@ -805,7 +800,7 @@ def _get_writer(file_or_filename, encoding):
805800
# Keep the original file open when the TextIOWrapper is
806801
# destroyed
807802
stack.callback(file.detach)
808-
yield file.write
803+
yield file.write, encoding
809804

810805
def _namespaces(elem, default_namespace=None):
811806
# identify namespaces used in this tree
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
:class:`~xml.etree.ElementTree.ElementTree` method
2+
:meth:`~xml.etree.ElementTree.ElementTree.write` and function
3+
:func:`~xml.etree.ElementTree.tostring` now use the text file's encoding
4+
("UTF-8" if not available) instead of the locale encoding in the XML declaration
5+
when ``encoding="unicode"`` is specified.

0 commit comments

Comments
 (0)