Skip to content

Commit c748078

Browse files
authored
Merge pull request #2775 from sparklemotion/2773-pseudo-io-serialization
fix: serialization with pseudo-IO objects like Zip::OutputStream
2 parents 1605431 + 952ff44 commit c748078

File tree

4 files changed

+93
-18
lines changed

4 files changed

+93
-18
lines changed

CHANGELOG.md

+9
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,15 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
44

55
---
66

7+
## 1.14.1 / unreleased
8+
9+
### Fixed
10+
11+
* Serializing documents now works again with pseudo-IO objects that don't support IO's encoding API (like rubyzip's `Zip::OutputStream`). This was a regression in v1.14.0 due to the fix for [#752](https://github.com/sparklemotion/nokogiri/issues/752) in [#2434](https://github.com/sparklemotion/nokogiri/issues/2434), and was not completely fixed by [#2753](https://github.com/sparklemotion/nokogiri/issues/2753). [[#2773](https://github.com/sparklemotion/nokogiri/issues/2773)]
12+
13+
2e260f53e6b84b8f9c1b115b0ded85eebc8155d7
14+
15+
716
## 1.14.0 / 2023-01-12
817

918
### Notable Changes

Gemfile

+1
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ group :development do
2424
gem "minitest-reporters", "= 1.5.0"
2525
gem "ruby_memcheck", "1.2.0" unless RUBY_PLATFORM == "java"
2626
gem "simplecov", "= 0.21.2"
27+
gem "rubyzip", "~> 2.3.2"
2728

2829
# rubocop
2930
if Gem::Requirement.new("~> 3.0").satisfied_by?(Gem::Version.new(RUBY_VERSION))

ext/nokogiri/nokogiri.c

+7-2
Original file line number | Diff line number | Diff line change
@@ -112,8 +112,13 @@ noko_io_write(void *io, char *c_buffer, int c_buffer_len)
112112
{
113113
VALUE rb_args[2], rb_n_bytes_written;
114114
VALUE rb_io = (VALUE)io;
115-
VALUE rb_enc = rb_funcall(rb_io, id_external_encoding, 0);
116-
rb_encoding *io_encoding = RB_NIL_P(rb_enc) ? rb_ascii8bit_encoding() : rb_to_encoding(rb_enc);
115+
VALUE rb_enc = Qnil;
116+
rb_encoding *io_encoding;
117+
118+
if (rb_respond_to(rb_io, id_external_encoding)) {
119+
rb_enc = rb_funcall(rb_io, id_external_encoding, 0);
120+
}
121+
io_encoding = RB_NIL_P(rb_enc) ? rb_ascii8bit_encoding() : rb_to_encoding(rb_enc);
117122

118123
rb_args[0] = rb_io;
119124
rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);

test/xml/test_document_encoding.rb

+76-16
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,17 @@ class TestDocumentEncoding < Nokogiri::TestCase
88
describe "Nokogiri::XML::Document encoding" do
99
let(:shift_jis_document) { Nokogiri::XML(File.read(SHIFT_JIS_XML), SHIFT_JIS_XML) }
1010
let(:ascii_document) { Nokogiri::XML.parse(File.read(XML_FILE), XML_FILE) }
11+
let(:utf16_document) do
12+
# the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
13+
# is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
14+
size = 8000
15+
<<~XML.encode(Encoding::UTF_16)
16+
<?xml version="1.0" encoding="UTF-16"?>
17+
<root>
18+
<bar>#{"A" * size}</bar>
19+
</root>
20+
XML
21+
end
1122

1223
describe "#encoding" do
1324
it "describes the document's encoding correctly" do
@@ -31,36 +42,85 @@ class TestDocumentEncoding < Nokogiri::TestCase
3142
end
3243

3344
it "encodes the URL as UTF-8" do
34-
assert_equal("UTF-8", shift_jis_document.url.encoding.name)
45+
assert_equal(Encoding::UTF_8, shift_jis_document.url.encoding)
3546
end
3647

3748
it "encodes the encoding name as UTF-8" do
38-
assert_equal("UTF-8", shift_jis_document.encoding.encoding.name)
49+
assert_equal(Encoding::UTF_8, shift_jis_document.encoding.encoding)
3950
end
4051

4152
it "encodes the library versions as UTF-8" do
4253
skip_unless_libxml2
43-
assert_equal("UTF-8", Nokogiri::LIBXML_COMPILED_VERSION.encoding.name)
44-
assert_equal("UTF-8", Nokogiri::LIBXSLT_COMPILED_VERSION.encoding.name)
54+
55+
assert_equal(Encoding::UTF_8, Nokogiri::LIBXML_COMPILED_VERSION.encoding)
56+
assert_equal(Encoding::UTF_8, Nokogiri::LIBXSLT_COMPILED_VERSION.encoding)
57+
end
58+
59+
it "parses and serializes UTF-16 correctly" do
60+
xml = <<~XML.encode(Encoding::UTF_16)
61+
<?xml version="1.0" encoding="UTF-16"?>
62+
<root><bar>A</bar></root>
63+
XML
64+
output = Nokogiri::XML(xml).to_xml
65+
output_doc = Nokogiri::XML(output)
66+
67+
# these byte sizes are descriptive, not prescriptive. the difference is whitespace. this may change
68+
# as implementations change. the intention is to verify that they're _roughly_ the right
69+
# length, i.e. not zero, half-width, or double-width.
70+
expected_bytesize = Nokogiri.jruby? ? 132 : 142
71+
72+
assert_equal(Encoding::UTF_16, output.encoding)
73+
assert_equal("UTF-16", output_doc.encoding)
74+
assert_equal(expected_bytesize, output.bytesize)
75+
output_doc.at_xpath("/root/bar/text()").tap do |node|
76+
assert(node, "unexpected DOM structure in #{output.inspect}")
77+
assert_equal("A", node.content)
78+
end
4579
end
4680

4781
it "serializes UTF-16 correctly across libxml2 buffer flushes" do
4882
# https://github.com/sparklemotion/nokogiri/issues/752
4983
skip_unless_libxml2
5084

51-
# the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
52-
# is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
53-
size = 8000
54-
input = String.new(<<~XML, encoding: "UTF-16")
55-
<?xml version="1.0" encoding="UTF-16"?>
56-
<root>
57-
<bar>#{"A" * size}</bar>
58-
</root>
59-
XML
60-
expected_length = (input.bytesize * 2) + 2 # double character width, add BOM bytes 0xFEFF
85+
output = Nokogiri::XML(utf16_document).to_xml
6186

62-
output = Nokogiri::XML(input).to_xml
63-
assert_equal(expected_length, output.bytesize)
87+
assert_equal(Encoding::UTF_16, output.encoding)
88+
assert_equal(utf16_document.bytesize, output.bytesize)
89+
end
90+
91+
describe "pseudo-IO" do
92+
it "serializes correctly with Zip::OutputStream objects" do
93+
# https://github.com/sparklemotion/nokogiri/issues/2773
94+
require "zip"
95+
96+
xml = <<~XML
97+
<?xml version="1.0" encoding="UTF-8"?>
98+
<root>
99+
<bar>A</bar>
100+
</root>
101+
XML
102+
103+
Dir.mktmpdir do |tmpdir|
104+
zipfile_path = File.join(tmpdir, "test.zip")
105+
106+
Zip::OutputStream.open(zipfile_path) do |io|
107+
io.put_next_entry("test-utf8.xml")
108+
Nokogiri::XML(xml).write_to(io, encoding: "UTF-8")
109+
end
110+
111+
Zip::InputStream.open(zipfile_path) do |io|
112+
entry = io.get_next_entry
113+
assert_equal("test-utf8.xml", entry.name)
114+
output = io.read
115+
116+
# no final newline on jruby. descriptive, not prescriptive.
117+
expected_length = Nokogiri.jruby? ? xml.bytesize - 1 : xml.bytesize
118+
119+
assert_equal(Encoding::UTF_8, output.encoding)
120+
assert_equal(expected_length, output.bytesize)
121+
end
122+
end
123+
end
64124
end
65125
end
66126
end

0 commit comments

Comments
 (0)