@@ -8,6 +8,17 @@ class TestDocumentEncoding < Nokogiri::TestCase
8
8
describe "Nokogiri::XML::Document encoding" do
9
9
let ( :shift_jis_document ) { Nokogiri ::XML ( File . read ( SHIFT_JIS_XML ) , SHIFT_JIS_XML ) }
10
10
let ( :ascii_document ) { Nokogiri ::XML . parse ( File . read ( XML_FILE ) , XML_FILE ) }
11
+ let ( :utf16_document ) do
12
+ # the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
13
+ # is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
14
+ size = 8000
15
+ <<~XML . encode ( Encoding ::UTF_16 )
16
+ <?xml version="1.0" encoding="UTF-16"?>
17
+ <root>
18
+ <bar>#{ "A" * size } </bar>
19
+ </root>
20
+ XML
21
+ end
11
22
12
23
describe "#encoding" do
13
24
it "describes the document's encoding correctly" do
@@ -31,36 +42,85 @@ class TestDocumentEncoding < Nokogiri::TestCase
31
42
end
32
43
33
44
it "encodes the URL as UTF-8" do
34
- assert_equal ( "UTF-8" , shift_jis_document . url . encoding . name )
45
+ assert_equal ( Encoding :: UTF_8 , shift_jis_document . url . encoding )
35
46
end
36
47
37
48
it "encodes the encoding name as UTF-8" do
38
- assert_equal ( "UTF-8" , shift_jis_document . encoding . encoding . name )
49
+ assert_equal ( Encoding :: UTF_8 , shift_jis_document . encoding . encoding )
39
50
end
40
51
41
52
it "encodes the library versions as UTF-8" do
42
53
skip_unless_libxml2
43
- assert_equal ( "UTF-8" , Nokogiri ::LIBXML_COMPILED_VERSION . encoding . name )
44
- assert_equal ( "UTF-8" , Nokogiri ::LIBXSLT_COMPILED_VERSION . encoding . name )
54
+
55
+ assert_equal ( Encoding ::UTF_8 , Nokogiri ::LIBXML_COMPILED_VERSION . encoding )
56
+ assert_equal ( Encoding ::UTF_8 , Nokogiri ::LIBXSLT_COMPILED_VERSION . encoding )
57
+ end
58
+
59
+ it "parses and serializes UTF-16 correctly" do
60
+ xml = <<~XML . encode ( Encoding ::UTF_16 )
61
+ <?xml version="1.0" encoding="UTF-16"?>
62
+ <root><bar>A</bar></root>
63
+ XML
64
+ output = Nokogiri ::XML ( xml ) . to_xml
65
+ output_doc = Nokogiri ::XML ( output )
66
+
67
+ # these are descriptive, not prescriptive. the difference is whitespace. this may change
68
+ # as implementations change. the intention is to verify that they're _roughly_ the right
69
+ # length, they're not zero or half-width or double-width.
70
+ expected_bytesize = Nokogiri . jruby? ? 132 : 142
71
+
72
+ assert_equal ( Encoding ::UTF_16 , output . encoding )
73
+ assert_equal ( "UTF-16" , output_doc . encoding )
74
+ assert_equal ( expected_bytesize , output . bytesize )
75
+ output_doc . at_xpath ( "/root/bar/text()" ) . tap do |node |
76
+ assert ( node , "unexpected DOM structure in #{ output . inspect } " )
77
+ assert_equal ( "A" , node . content )
78
+ end
45
79
end
46
80
47
81
it "serializes UTF-16 correctly across libxml2 buffer flushes" do
48
82
# https://github.com/sparklemotion/nokogiri/issues/752
49
83
skip_unless_libxml2
50
84
51
- # the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
52
- # is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
53
- size = 8000
54
- input = String . new ( <<~XML , encoding : "UTF-16" )
55
- <?xml version="1.0" encoding="UTF-16"?>
56
- <root>
57
- <bar>#{ "A" * size } </bar>
58
- </root>
59
- XML
60
- expected_length = ( input . bytesize * 2 ) + 2 # double character width, add BOM bytes 0xFEFF
85
+ output = Nokogiri ::XML ( utf16_document ) . to_xml
61
86
62
- output = Nokogiri ::XML ( input ) . to_xml
63
- assert_equal ( expected_length , output . bytesize )
87
+ assert_equal ( Encoding ::UTF_16 , output . encoding )
88
+ assert_equal ( utf16_document . bytesize , output . bytesize )
89
+ end
90
+
91
+ describe "pseudo-IO" do
92
+ it "serializes correctly with Zip::OutputStream objects" do
93
+ # https://github.com/sparklemotion/nokogiri/issues/2773
94
+ require "zip"
95
+
96
+ xml = <<~XML
97
+ <?xml version="1.0" encoding="UTF-8"?>
98
+ <root>
99
+ <bar>A</bar>
100
+ </root>
101
+ XML
102
+
103
+ Dir . mktmpdir do |tmpdir |
104
+ zipfile_path = File . join ( tmpdir , "test.zip" )
105
+
106
+ Zip ::OutputStream . open ( zipfile_path ) do |io |
107
+ io . put_next_entry ( "test-utf8.xml" )
108
+ Nokogiri ::XML ( xml ) . write_to ( io , encoding : "UTF-8" )
109
+ end
110
+
111
+ Zip ::InputStream . open ( zipfile_path ) do |io |
112
+ entry = io . get_next_entry
113
+ assert_equal ( "test-utf8.xml" , entry . name )
114
+ output = io . read
115
+
116
+ # no final newline on jruby. descriptive, not prescriptive.
117
+ expected_length = Nokogiri . jruby? ? xml . bytesize - 1 : xml . bytesize
118
+
119
+ assert_equal ( Encoding ::UTF_8 , output . encoding )
120
+ assert_equal ( expected_length , output . bytesize )
121
+ end
122
+ end
123
+ end
64
124
end
65
125
end
66
126
end
0 commit comments