Skip to content

Commit f658892

Browse files
authored
Improve performance (#38)
Add an optimized `readbytes!` method which copies data in chunks instead of byte by byte. Internally, avoid passing a `SubArray` to `readbytes!`, as there is currently no optimized method for them; instead, wrap the corresponding memory in an `Array`. Increase the size of the buffer from 100 to 200 bytes, which appears to be a good tradeoff. This makes loading a file about 10 times faster than before.
1 parent c9c34ae commit f658892

File tree

2 files changed

+71
-7
lines changed

2 files changed

+71
-7
lines changed

src/StringEncodings.jl

+32-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ end
2020
using Base.Libc: errno, strerror, E2BIG, EINVAL, EILSEQ
2121

2222
import Base: close, eachline, eof, flush, isreadable, iswritable,
23-
open, readline, readlines, readuntil, show, write, read
23+
open, read, readbytes!, readline, readlines, readuntil, show, write
2424

2525
export StringEncoder, StringDecoder, encode, decode, encodings
2626
export StringEncodingError, OutputBufferError, IConvError
@@ -94,7 +94,7 @@ end
9494

9595
## StringEncoder and StringDecoder common functions
9696

97-
const BUFSIZE = 100
97+
const BUFSIZE = 200
9898

9999
mutable struct StringEncoder{F<:Encoding, T<:Encoding, S<:IO} <: IO
100100
stream::S
@@ -318,7 +318,13 @@ function fill_buffer!(s::StringDecoder)
318318
return i
319319
end
320320

321-
s.inbytesleft[] += readbytes!(s.stream, view(s.inbuf, Int(s.inbytesleft[]+1):BUFSIZE))
321+
# readbytes! performance with SubArray was improved by JuliaLang/julia#36607
322+
@static if VERSION >= v"1.6.0-DEV.438"
323+
inbuf_view = view(s.inbuf, Int(s.inbytesleft[]+1):BUFSIZE)
324+
else
325+
inbuf_view = unsafe_wrap(Array, pointer(s.inbuf, s.inbytesleft[]+1), BUFSIZE)
326+
end
327+
s.inbytesleft[] += readbytes!(s.stream, inbuf_view)
322328
iconv!(s.cd, s.inbuf, s.outbuf, s.inbufptr, s.outbufptr, s.inbytesleft, s.outbytesleft)
323329
end
324330

@@ -328,7 +334,7 @@ end
328334
# data contains only state control sequences which may be converted to nothing)
329335
# 3) if not, reset iconv to initial state, which may generate data
330336
function eof(s::StringDecoder)
331-
length(s.outbuf) - s.outbytesleft[] == s.skip &&
337+
BUFSIZE - s.outbytesleft[] == s.skip &&
332338
fill_buffer!(s) == 0 &&
333339
iconv_reset!(s) == 0
334340
end
@@ -403,6 +409,28 @@ function open(fname::AbstractString, enc::Encoding, mode::AbstractString)
403409
wrap_stream(open(fname, mode), enc)
404410
end
405411

412+
# Optimized method adapted from Base: rather than copying a single byte per
# iteration, each iteration copies every decoded byte still pending in the
# output buffer, which increases performance dramatically.
#
# Reads at most `nb` decoded bytes from `s` into `b`. `b` is grown when it is
# too small to hold the data, and in that case it is shrunk back so that it
# contains exactly the bytes read. Returns the number of bytes read.
function readbytes!(s::StringDecoder, b::AbstractArray{UInt8}, nb=length(b))
    olb = lb = length(b)
    nr = 0
    while nr < nb && !eof(s)
        # The first `s.skip` bytes of `s.outbuf` were already handed out by a
        # previous read, so only the converted bytes located after them are
        # pending. Ignoring `s.skip` here would copy bytes past the converted
        # data and push `s.skip` beyond it, which `eof` never recovers from.
        avail = Int(BUFSIZE - s.outbytesleft[]) - s.skip
        nc = min(nb - nr, avail)
        if nr + nc > lb
            # grow geometrically to limit the number of reallocations
            lb = (nr + nc) * 2
            resize!(b, lb)
        end
        copyto!(b, firstindex(b) + nr, s.outbuf, s.skip + 1, nc)
        s.skip += nc
        nr += nc
    end
    if lb > olb
        resize!(b, nr) # shrink to just contain input data if was resized
    end
    return nr
end
433+
406434
"""
407435
read(stream::IO, [nb::Integer,] enc::Encoding)
408436
read(filename::AbstractString, [nb::Integer,] enc::Encoding)

test/runtests.jl

+39-3
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ mktemp() do path, io
141141
write(io, s)
142142
end
143143

144+
nc = ncodeunits(first(s, 90))
145+
144146
@test String(read(path, enc"ISO-2022-JP")) == s
145147
@test String(open(io->read(io, enc"ISO-2022-JP"), path)) == s
146148
@test String(open(io->read(io), path, enc"ISO-2022-JP")) == s
@@ -149,14 +151,48 @@ mktemp() do path, io
149151
@test String(open(io->read(io, 1000, enc"ISO-2022-JP"), path)) == s
150152
@test String(open(io->read(io, 1000), path, enc"ISO-2022-JP")) == s
151153

152-
@test String(read(path, 10, enc"ISO-2022-JP")) == first(s, 10)
153-
@test String(open(io->read(io, 10, enc"ISO-2022-JP"), path)) == first(s, 10)
154-
@test String(open(io->read(io, 10), path, enc"ISO-2022-JP")) == first(s, 10)
154+
@test String(read(path, nc, enc"ISO-2022-JP")) == first(s, nc)
155+
@test String(open(io->read(io, nc, enc"ISO-2022-JP"), path)) == first(s, nc)
156+
@test String(open(io->read(io, nc), path, enc"ISO-2022-JP")) == first(s, nc)
155157

156158
@test read(path, String, enc"ISO-2022-JP") == s
157159
@test open(io->read(io, String, enc"ISO-2022-JP"), path) == s
158160
@test open(io->read(io, String), path, enc"ISO-2022-JP") == s
159161

162+
b = zeros(UInt8, nc)
163+
@test open(io->read!(io, b), path, enc"ISO-2022-JP") === b
164+
@test String(b) == first(s, 90)
165+
166+
b = zeros(UInt8, nc)
167+
@test open(io->readbytes!(io, b), path, enc"ISO-2022-JP") == ncodeunits(s)
168+
@test String(b) == s
169+
170+
b = zeros(UInt8, 1000)
171+
@test open(io->readbytes!(io, b), path, enc"ISO-2022-JP") == ncodeunits(s)
172+
@test length(b) == 1000
173+
@test String(b[1:ncodeunits(s)]) == s
174+
175+
b = UInt8[]
176+
@test open(io->readbytes!(io, b), path, enc"ISO-2022-JP") == 0
177+
@test length(b) == 0
178+
179+
b = zeros(UInt8, nc)
180+
@test open(io->readbytes!(io, b, nc), path, enc"ISO-2022-JP") == nc
181+
@test String(b) == first(s, 90)
182+
183+
b = zeros(UInt8, 1000)
184+
@test open(io->readbytes!(io, b, nc), path, enc"ISO-2022-JP") == nc
185+
@test length(b) == 1000
186+
@test String(b[1:nc]) == first(s, 90)
187+
188+
b = UInt8[]
189+
@test open(io->readbytes!(io, b, nc), path, enc"ISO-2022-JP") == nc
190+
@test String(b) == first(s, 90)
191+
192+
b = UInt8[]
193+
open(io->while !eof(io); push!(b, read(io, UInt8)) end, path, enc"ISO-2022-JP")
194+
@test String(b) == s
195+
160196
@test readuntil(path, enc"ISO-2022-JP", '\0') == "a string "
161197
@test open(io->readuntil(io, enc"ISO-2022-JP", '\0'), path) == "a string "
162198
@test open(io->readuntil(io, enc"ISO-2022-JP", '\0', keep=true), path) == "a string \0"

0 commit comments

Comments
 (0)