Skip to content

Commit c55d636

Browse files
authored
Merge pull request #38 from JuliaString/spj/codeunits
Optimize codeunits, ==, cmp
2 parents d2ff885 + 98326ab commit c55d636

File tree

3 files changed

+117
-45
lines changed

3 files changed

+117
-45
lines changed

Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name = "ShortStrings"
22
uuid = "63221d1c-8677-4ff0-9126-0ff0817b4975"
33
authors = ["Dai ZJ <[email protected]>", "ScottPJones <[email protected]>",
44
"Lyndon White <[email protected]>"]
5-
version = "0.3.3"
5+
version = "0.3.4"
66

77
[deps]
88
BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"

src/base.jl

+106-44
Original file line numberDiff line numberDiff line change
@@ -2,34 +2,45 @@
22

33
using BitIntegers: @define_integers
44

5-
import Base: unsafe_getindex, ==, show, promote_rule
5+
import Base: unsafe_getindex, ==, cmp, promote_rule
66
using Base: @_inline_meta, @propagate_inbounds, @_propagate_inbounds_meta
77
import Base.GC: @preserve
88

9+
"""
10+
Type for holding short, fixed maximum size strings efficiently
11+
"""
912
struct ShortString{T} <: AbstractString where {T}
1013
size_content::T
1114
end
1215

13-
"""Check if a string of size `sz` can be stored in ShortString{T}"""
14-
function check_size(T, sz)
15-
max_len = sizeof(T) - size_bytes(T) # the last few bytes are used to store the length
16-
if sz > max_len
17-
throw(ErrorException("sizeof(::$T) must be shorter than or equal to $(max_len) in length; you have supplied a string of size $sz"))
18-
end
19-
end
16+
"""The size of the chunk used to process String values"""
17+
const CHUNKSZ = sizeof(UInt)
18+
19+
"""Mask used for alignment"""
20+
const CHUNKMSK = (CHUNKSZ-1)%UInt
21+
22+
"""The number of bits in the chunk type used to process String values"""
23+
const CHUNKBITS = sizeof(UInt) == 4 ? 32 : 64
2024

2125
"""Calculate the number of bytes required to store the size of the ShortString"""
2226
size_bytes(::Type{T}) where {T} = (count_ones(sizeof(T)-1)+7)>>3
2327

28+
"""Calculate the maximum length in bytes that can be stored in this ShortString"""
29+
max_len(T) = sizeof(T) - size_bytes(T)
30+
31+
"""
    check_size(T, sz)

Verify that a string of `sz` bytes can be stored in a `ShortString{T}`.
Throws an `ErrorException` when `sz` is larger than `max_len(T)`.
"""
@inline function check_size(T, sz)
    limit = max_len(T)
    return sz > limit && throw(ErrorException(
        "sizeof(::$T) must be shorter than or equal to $(limit) in length; you have supplied a string of size $sz"))
end
37+
2438
"""Calculate a mask to get the size stored in the ShortString"""
2539
size_mask(T) = T((1<<(size_bytes(T)*8)) - 1)
2640
size_mask(s::ShortString{T}) where {T} = size_mask(T)
2741

28-
"""The size of the chunk used to process String values"""
29-
const CHUNKSZ = sizeof(UInt)
30-
31-
"""The number of bits in the chunk type used to process String values"""
32-
const CHUNKBITS = sizeof(UInt) == 4 ? 32 : 64
42+
"""Get the contents of the ShortString without the size, in native order"""
43+
_swapped_str(s::ShortString) = ntoh(s.size_content & ~size_mask(s))
3344

3445
"""
Internal function to pick up a byte at the given index in a ShortString.

The shift is computed from the byte width of the backing integer (`s.size_content`).
`sizeof(s)` must not be used here: for a `ShortString` it is overloaded to return the
length of the stored string, which would select the wrong byte.
"""
@inline _get_byte(s::ShortString, i::Int) =
    (s.size_content >>> (8*(sizeof(s.size_content) - i)))%UInt8
@@ -105,8 +116,7 @@ function ShortString{T}(s::ShortString{S}) where {T, S}
105116
# size_mask(S) will return a mask for getting the size for ShortStrings in (content size)
106117
# format, so something like 00001111 in binary.
107118
# ~size_mask(S) will yield 11110000 which can be used as a mask to extract the content
108-
content = ntoh(T(ntoh(s.size_content & ~size_mask(S))))
109-
ShortString{T}(content | T(sz))
119+
ShortString{T}(ntoh(T(_swapped_str(s))) | T(sz))
110120
end
111121

112122
"""Amount to shift ShortString value by for each UInt sized chunk"""
@@ -129,16 +139,16 @@ function String(s::ShortString{T}) where {T}
129139
end
130140

131141
Base.codeunit(s::ShortString) = UInt8
132-
Base.codeunit(s::ShortString, i) = codeunits(String(s), i)
133-
Base.codeunit(s::ShortString, i::Integer) = codeunit(String(s), i)
134-
Base.codeunits(s::ShortString) = codeunits(String(s))
142+
"""
    codeunit(s::ShortString, i::Integer)

Return the byte (code unit) stored at index `i` of `s`.

The index is validated with `checkbounds` inside a `@boundscheck` block.
"""
@inline function Base.codeunit(s::ShortString, i::Integer)
    @boundscheck checkbounds(s, i)
    # `_get_byte` only has a method for `Int` indices; convert so that other
    # Integer types (Int32, UInt8, ...) do not raise a MethodError.
    return _get_byte(s, Int(i))
end
135146

136147
# `convert` is called with the target *type* as its first argument, so the methods must
# dispatch on `::Type{...}`; annotating an instance there defines methods that
# `convert(ShortString{T}, s)` / `convert(String, ss)` never reach.
Base.convert(::Type{ShortString{T}}, s::String) where {T} = ShortString{T}(s)
Base.convert(::Type{String}, ss::ShortString) = String(ss)
138149

139150
Base.sizeof(s::ShortString) = Int(s.size_content & size_mask(s))
140151

141-
Base.firstindex(::ShortString) = 1
142152
Base.lastindex(s::ShortString) = sizeof(s)
143153
Base.ncodeunits(s::ShortString) = sizeof(s)
144154

@@ -162,7 +172,7 @@ end
162172

163173
@inline function Base.isascii(s::ShortString{T}) where {T}
164174
val = s.size_content >>> (8*size_bytes(T))
165-
for i in 1:(sizeof(T)-size_bytes(T))
175+
for i in 1:max_len(T)
166176
iszero(val & 0x80) || return false
167177
val >>>= 8
168178
end
@@ -197,29 +207,59 @@ end
197207
reinterpret(Char, _get_char(str, pos))
198208
end
199209

200-
function ==(s::ShortString{S}, b::Union{String, SubString{String}}) where {S}
201-
ncodeunits(b) == ncodeunits(s) || return false
202-
return s == ShortString{S}(b)
210+
@inline _mask_bytes(n) = ((1%UInt) << ((n & CHUNKMSK) << 3)) - 0x1
211+
212+
# Optimized version of checking for equality against a string
213+
function ==(a::ShortString, b::String)
    # Compare a ShortString with a String, one UInt-sized chunk of `b` at a time.
    # Returns `true` only when both hold the same number of bytes and every byte matches.
    sz = sizeof(a)
    sizeof(b) == sz || return false
    # Two empty strings are equal; every other case must actually compare the bytes.
    sz == 0 && return true
    # Contents of `a` without the size bytes, in native order (see `_swapped_str`)
    val = _swapped_str(a)
    @preserve b begin
        pnt = reinterpret(Ptr{UInt}, pointer(b))
        # Full chunks: compare the low chunk of `val`, then shift it out
        while sz >= CHUNKSZ
            xor(val & typemax(UInt), unsafe_load(pnt)) == 0 || return false
            sz -= CHUNKSZ
            val >>>= CHUNKBITS
            pnt += CHUNKSZ
        end
        # Tail: keep only the remaining `sz` bytes of the last chunk.
        # NOTE(review): this loads a full UInt that may extend past the last byte of `b`;
        # presumably relies on padding of String storage - confirm.
        return sz === 0 || val == (unsafe_load(pnt) & _mask_bytes(sz))
    end
end
204-
function ==(s::ShortString, b::AbstractString)
205-
# Could be a string type that might not use UTF8 encoding and that we don't have a
206-
# constructor for. Defer to equality that type probably has defined on `String`
207-
return String(s) == b
229+
230+
# This can be optimized to be much faster, like the code in StrBase.jl, doing 4 or 8 byte
231+
# chunks, as above, but it has to deal with alignment. Will add to a later PR
232+
function ==(s::ShortString, b::SubString{String})
    # Byte-by-byte comparison of a ShortString with a SubString{String}.
    # Returns `true` only when both hold the same number of bytes and every byte matches.
    sz = sizeof(s)
    sizeof(b) == sz || return false
    # Two empty strings are equal; every other case must actually compare the bytes.
    sz == 0 && return true
    # Contents of `s` without the size bytes, in native order (see `_swapped_str`)
    val = _swapped_str(s)
    # The raw pointer is taken from `b`, so `b` is the object that has to stay rooted
    # while it is dereferenced.
    @preserve b begin
        pnt = pointer(b)
        while (sz -= 1) >= 0
            # the low byte of `val` is the next byte of `s`
            unsafe_load(pnt) == (val & 0xff) || return false
            pnt += 1
            val >>>= 8
        end
    end
    return true
end
209247

210248
==(a::AbstractString, b::ShortString) = b == a
211-
function ==(a::ShortString{S}, b::ShortString{S}) where {S}
212-
return a.size_content == b.size_content
213-
end
214-
function ==(a::ShortString{A}, b::ShortString{B}) where {A,B}
215-
ncodeunits(a) == ncodeunits(b) || return false
216-
# compare if equal after dropping size bits and
217-
# flipping so that the empty bytes are at the start
218-
ntoh(a.size_content & ~size_mask(A)) == ntoh(b.size_content & ~size_mask(B))
219-
end
220249

221-
function Base.cmp(a::ShortString{S}, b::ShortString{S}) where {S}
222-
return cmp(a.size_content, b.size_content)
250+
==(a::ShortString{S}, b::ShortString{S}) where {S} = (a.size_content == b.size_content)
251+
252+
# compare if equal after dropping size bits and flipping so that the empty bytes are at the start
253+
==(a::ShortString, b::ShortString) = sizeof(a) == sizeof(b) && _swapped_str(a) == _swapped_str(b)
254+
255+
cmp(a::ShortString{S}, b::ShortString{S}) where {S} = cmp(a.size_content, b.size_content)
256+
257+
function cmp(a::ShortString{S}, b::ShortString{T}) where {S,T}
    # Compare ShortStrings backed by different integer types by re-encoding the narrower
    # one in the wider type (the same transformation used when converting between
    # ShortString types), then comparing the raw `size_content` values.
    # NOTE(review): assumes both types use the same number of size bytes - confirm for
    # backing types wider than those in `def_types`.
    if sizeof(T) > sizeof(S)
        # `b` is wider: widen `a` to `T`
        return cmp(ntoh(T(_swapped_str(a))) | T(sizeof(a)), b.size_content)
    else
        # `a` is at least as wide: widen `b` to `S` (re-encoding in `T` would be a no-op
        # and would compare integers of different widths)
        return cmp(a.size_content, ntoh(S(_swapped_str(b))) | S(sizeof(b)))
    end
end
224264

225265
promote_rule(::Type{String}, ::Type{ShortString{S}}) where {S} = String
@@ -240,9 +280,9 @@ size_content(s::ShortString) = s.size_content
240280
const def_types = (UInt32, UInt64, UInt128, UInt256, UInt512, UInt1024, UInt2048)
241281

242282
for T in def_types
243-
max_len = sizeof(T) - size_bytes(T)
244-
constructor_name = Symbol(:ShortString, max_len)
245-
macro_name = Symbol(:ss, max_len, :_str)
283+
maxlen = max_len(T)
284+
constructor_name = Symbol(:ShortString, maxlen)
285+
macro_name = Symbol(:ss, maxlen, :_str)
246286

247287
@eval const $constructor_name = ShortString{$T}
248288
@eval macro $(macro_name)(s)
@@ -257,17 +297,39 @@ which can be used to store the string
257297
If no type is large enough, then an `ArgumentError` is thrown
258298
"""
259299
function get_type(maxlen; types=def_types)
    # Reject lengths below 1 up front
    if maxlen < 1
        throw(ArgumentError("$maxlen is <= 0"))
    end
    # Return the first candidate (in the order given) whose capacity is sufficient
    for T in types
        if maxlen <= max_len(T)
            return ShortString{T}
        end
    end
    throw(ArgumentError("$maxlen is too large to fit into any of the provided types: $types"))
end
265306

307+
"""
308+
Create a ShortString, using the smallest ShortString that can fit the string, unless the second
309+
argument `maxlen` is passed.
310+
If the keyword argument `types` is passed with a list (a tuple or Vector) of Unsigned
311+
types, in order of their size, then one of those types will be used.
312+
"""
266313
ShortString(str::Union{String,SubString{String}}, maxlen = sizeof(str); types=def_types) =
267314
get_type(maxlen, types=types)(str)
268315

269-
macro ss_str(str, max="0")
270-
:( ShortString($str, $(parse(Int, max))) )
316+
"""
317+
Create a ShortString, using the smallest ShortString that can fit the string,
318+
unless it is optionally followed by a single ASCII character and a maximum length.
319+
`ss"foo"b255` indicates that a ShortString that can contain 255 bytes should be used.
320+
"""
321+
macro ss_str(str, max=nothing)
    # Resolve the requested maximum length at macro-expansion time
    maxlen = if max === nothing
        # no suffix given: size the type to the literal itself
        sizeof(str)
    elseif max isa Integer
        max
    elseif max isa String
        # a leading non-digit flag character is dropped before parsing
        digits = isdigit(max[1]) ? max : max[2:end]
        parsed = tryparse(Int, digits)
        parsed === nothing &&
            throw(ArgumentError("Optional length $max not a valid Integer"))
        parsed
    else
        throw(ArgumentError("Unsupported type $(typeof(max)) for optional length $max"))
    end
    return :( ShortString($str, $maxlen) )
end
272334

273335
fsort(v::Vector{ShortString{T}}; rev = false) where {T} =

test/runtests.jl

+10
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,13 @@ end
123123

124124
# Iterations
125125
# The expected vector must list every character of the literal, including '∫'
@test collect(ShortString15("x∫yâz")) == ['x','∫','y','â','z']
126+
127+
@testset "Constructors" begin
128+
@test typeof(ShortString("foo")) === ShortString3
129+
@test typeof(ShortString("foo", 255)) === ShortString255
130+
@test typeof(ss"foo") == ShortString3
131+
@test typeof(ss"foo"b255) == ShortString255
132+
133+
@test_throws ErrorException ShortString("foobar", 3)
134+
@test_throws ErrorException ss"foobar"b3
135+
end

0 commit comments

Comments
 (0)