2
2
3
3
using BitIntegers: @define_integers
4
4
5
- import Base: unsafe_getindex, == , show , promote_rule
5
+ import Base: unsafe_getindex, == , cmp , promote_rule
6
6
using Base: @_inline_meta , @propagate_inbounds , @_propagate_inbounds_meta
7
7
import Base. GC: @preserve
8
8
9
"""
    ShortString{T} <: AbstractString

Type for holding short, fixed maximum size strings efficiently.

The whole value is kept in the single field `size_content` of type `T`, which
carries both the string bytes and the stored size.
"""
struct ShortString{T} <: AbstractString
    size_content::T
end
12
15
13
- """ Check if a string of size `sz` can be stored in ShortString{T}"""
14
- function check_size (T, sz)
15
- max_len = sizeof (T) - size_bytes (T) # the last few bytes are used to store the length
16
- if sz > max_len
17
- throw (ErrorException (" sizeof(::$T ) must be shorter than or equal to $(max_len) in length; you have supplied a string of size $sz " ))
18
- end
19
- end
16
"""The size, in bytes, of the chunk used to process String values"""
const CHUNKSZ = sizeof(UInt)

"""Mask used for alignment (all bits below the chunk size set)"""
const CHUNKMSK = UInt(CHUNKSZ - 1)

"""The number of bits in the chunk type used to process String values"""
const CHUNKBITS = 8 * CHUNKSZ
20
24
21
25
"""
    size_bytes(::Type{T})

Calculate the number of bytes required to store the size of the ShortString.
"""
function size_bytes(::Type{T}) where {T}
    # Set bits in `sizeof(T) - 1`; when `sizeof(T)` is a power of two this is the
    # bit width of the largest byte offset inside `T`.
    bits = count_ones(sizeof(T) - 1)
    # Round up to whole bytes.
    return cld(bits, 8)
end
23
27
28
"""
    max_len(T)

Calculate the maximum length in bytes that can be stored in this ShortString:
the storage size of `T` minus the bytes reserved for the size field.
"""
function max_len(T)
    reserved = size_bytes(T)
    return sizeof(T) - reserved
end
30
+
31
"""
    check_size(T, sz)

Check if a string of size `sz` can be stored in ShortString{T}.

Throws an `ErrorException` when `sz` exceeds `max_len(T)`; otherwise returns `false`.
"""
@inline function check_size(T, sz)
    limit = max_len(T)
    if sz > limit
        throw(ErrorException("sizeof(::$T) must be shorter than or equal to $(limit) in length; you have supplied a string of size $sz"))
    end
    return false
end
37
+
24
38
"""
    size_mask(T)
    size_mask(s::ShortString)

Calculate a mask to get the size stored in the ShortString: a value of type `T`
with the low `size_bytes(T)` bytes set.
"""
function size_mask(T)
    nbits = size_bytes(T) * 8
    return T((1 << nbits) - 1)
end

# An instance uses the mask of its own storage type.
function size_mask(s::ShortString{T}) where {T}
    return size_mask(T)
end
27
41
28
- """ The size of the chunk used to process String values"""
29
- const CHUNKSZ = sizeof (UInt)
30
-
31
- """ The number of bits in the chunk type used to process String values"""
32
- const CHUNKBITS = sizeof (UInt) == 4 ? 32 : 64
42
"""
    _swapped_str(s::ShortString)

Get the contents of the ShortString without the size, in native order.
The size bytes are cleared first, then the remainder is passed through `ntoh`.
"""
function _swapped_str(s::ShortString)
    content_only = s.size_content & ~size_mask(s)
    return ntoh(content_only)
end
33
44
34
45
"""
    _get_byte(s::ShortString, i::Int)

Internal function to pick up a byte at the given index in a ShortString.

The shift is measured from the top of the storage value, so the width used must
be that of `s.size_content`. `sizeof(s)` cannot be used here: for a ShortString
it is overridden to return the stored string length, not the storage size.
No bounds checking is done; callers are expected to validate `i`.
"""
@inline _get_byte(s::ShortString, i::Int) =
    (s.size_content >>> (8 * (sizeof(s.size_content) - i))) % UInt8
@@ -105,8 +116,7 @@ function ShortString{T}(s::ShortString{S}) where {T, S}
105
116
# size_mask(S) will return a mask for getting the size for ShortStrings in (content size)
106
117
# format, so something like 00001111 in binary.
107
118
# ~size_mask(S) will yield 11110000 which can be used as a mask to extract the content
108
- content = ntoh (T (ntoh (s. size_content & ~ size_mask (S))))
109
- ShortString {T} (content | T (sz))
119
+ ShortString {T} (ntoh (T (_swapped_str (s))) | T (sz))
110
120
end
111
121
112
122
""" Amount to shift ShortString value by for each UInt sized chunk"""
@@ -129,16 +139,16 @@ function String(s::ShortString{T}) where {T}
129
139
end
130
140
131
141
# The code unit type of a ShortString is always a single byte.
function Base.codeunit(s::ShortString)
    return UInt8
end
132
- Base. codeunit (s:: ShortString , i) = codeunits (String (s), i)
133
- Base. codeunit (s:: ShortString , i:: Integer ) = codeunit (String (s), i)
134
- Base. codeunits (s:: ShortString ) = codeunits (String (s))
142
"""
    codeunit(s::ShortString, i::Integer)

Return the byte (code unit) at index `i` of `s`.

Bounds are checked with `checkbounds` unless elided by `@inbounds`.
"""
@inline function Base.codeunit(s::ShortString, i::Integer)
    @boundscheck checkbounds(s, i)
    # `_get_byte` only has a method for `Int` indices; convert so that any
    # `Integer` index (e.g. `Int32`, `UInt8`) works instead of raising MethodError.
    return _get_byte(s, Int(i))
end
135
146
136
147
# `convert` is called with the target *type* as its first argument, so the
# methods must dispatch on `Type{...}`; annotating the argument as an instance
# would define methods that `convert(ShortString{T}, s)` never reaches.
Base.convert(::Type{ShortString{T}}, s::String) where {T} = ShortString{T}(s)
Base.convert(::Type{String}, ss::ShortString) = String(ss)
138
149
139
150
# Size of the stored string in bytes, read from the size field of `size_content`.
# Note this is the string length, not the storage size of the underlying integer.
function Base.sizeof(s::ShortString)
    stored = s.size_content & size_mask(s)
    return Int(stored)
end
140
151
141
- Base. firstindex (:: ShortString ) = 1
142
152
# Both are defined as the stored byte count.
# NOTE(review): `lastindex` returns the byte count, which differs from the index of
# the last character when that character is multi-byte -- confirm this is intended.
function Base.lastindex(s::ShortString)
    return sizeof(s)
end

function Base.ncodeunits(s::ShortString)
    return sizeof(s)
end
144
154
162
172
163
173
@inline function Base. isascii (s:: ShortString{T} ) where {T}
164
174
val = s. size_content >>> (8 * size_bytes (T))
165
- for i in 1 : ( sizeof (T) - size_bytes (T) )
175
+ for i in 1 : max_len (T )
166
176
iszero (val & 0x80 ) || return false
167
177
val >>>= 8
168
178
end
@@ -197,29 +207,59 @@ end
197
207
reinterpret (Char, _get_char (str, pos))
198
208
end
199
209
200
- function == (s:: ShortString{S} , b:: Union{String, SubString{String}} ) where {S}
201
- ncodeunits (b) == ncodeunits (s) || return false
202
- return s == ShortString {S} (b)
210
# Mask selecting the low `n & CHUNKMSK` bytes of a `UInt` chunk.
@inline function _mask_bytes(n)
    nbits = (n & CHUNKMSK) << 3
    return (UInt(1) << nbits) - 0x1
end
211
+
212
# Optimized version of checking for equality against a string.
# Compares the ShortString contents against the bytes of `b` one `UInt`-sized
# chunk at a time, then compares the remaining tail bytes through a mask.
function ==(a::ShortString, b::String)
    sz = sizeof(a)
    sizeof(b) == sz || return false
    # Two empty strings are equal. Non-empty strings of equal size still need
    # their bytes compared, so only the empty case may return early here.
    sz == 0 && return true
    val = _swapped_str(a)
    @preserve b begin
        pnt = reinterpret(Ptr{UInt}, pointer(b))
        while sz >= CHUNKSZ
            # Low chunk of the remaining contents vs. the next chunk of `b`.
            (val & typemax(UInt)) == unsafe_load(pnt) || return false
            sz -= CHUNKSZ
            val >>>= 8 * CHUNKSZ
            pnt += CHUNKSZ
        end
        # NOTE(review): the tail load reads a whole `UInt` even though fewer than
        # CHUNKSZ bytes remain; presumably safe for `String` storage -- confirm.
        return sz === 0 || val == (unsafe_load(pnt) & _mask_bytes(sz))
    end
end
204
- function == (s:: ShortString , b:: AbstractString )
205
- # Could be a string type that might not use UTF8 encoding and that we don't have a
206
- # constructor for. Defer to equality that type probably has defined on `String`
207
- return String (s) == b
229
+
230
# This can be optimized to be much faster, like the code in StrBase.jl, doing 4 or 8 byte
# chunks, as above, but it has to deal with alignment. Will add to a later PR.
# For now the contents are compared against `b` one byte at a time.
function ==(s::ShortString, b::SubString{String})
    sz = sizeof(s)
    sizeof(b) == sz || return false
    # Two empty strings are equal. Non-empty strings of equal size still need
    # their bytes compared, so only the empty case may return early here.
    sz == 0 && return true
    val = _swapped_str(s)
    # The raw pointer is derived from `b`, so `b` is the object that must be
    # kept alive while the pointer is in use.
    @preserve b begin
        pnt = pointer(b)
        for _ in 1:sz
            unsafe_load(pnt) == (val & 0xff) || return false
            pnt += 1
            val >>>= 8
        end
    end
    return true
end
209
247
210
248
# Equality with the ShortString on the right simply swaps the operands.
function ==(a::AbstractString, b::ShortString)
    return b == a
end
211
- function == (a:: ShortString{S} , b:: ShortString{S} ) where {S}
212
- return a. size_content == b. size_content
213
- end
214
- function == (a:: ShortString{A} , b:: ShortString{B} ) where {A,B}
215
- ncodeunits (a) == ncodeunits (b) || return false
216
- # compare if equal after dropping size bits and
217
- # flipping so that the empty bytes are at the start
218
- ntoh (a. size_content & ~ size_mask (A)) == ntoh (b. size_content & ~ size_mask (B))
219
- end
220
249
221
- function Base. cmp (a:: ShortString{S} , b:: ShortString{S} ) where {S}
222
- return cmp (a. size_content, b. size_content)
250
# Same storage type: size and contents live in one value, so compare it directly.
function ==(a::ShortString{S}, b::ShortString{S}) where {S}
    return a.size_content == b.size_content
end

# Different storage types: compare if equal after dropping size bits and flipping
# so that the empty bytes are at the start.
function ==(a::ShortString, b::ShortString)
    sizeof(a) == sizeof(b) || return false
    return _swapped_str(a) == _swapped_str(b)
end
254
+
255
# Same storage type: ordering is the ordering of the raw `size_content` values.
function cmp(a::ShortString{S}, b::ShortString{S}) where {S}
    return cmp(a.size_content, b.size_content)
end
256
+
257
# Compare ShortStrings with different storage types.
# The operand with the smaller storage type is rebuilt in the larger type
# (contents widened and swapped back, size or-ed in), then the two
# `size_content` values are compared in that common type.
function cmp(a::ShortString{S}, b::ShortString{T}) where {S,T}
    if sizeof(T) > sizeof(S)
        # `b` is wider: re-encode `a` in `T`.
        return cmp(ntoh(T(_swapped_str(a))) | T(sizeof(a)), b.size_content)
    else
        # `a` is at least as wide: re-encode `b` in `S`. Re-encoding in `T` would
        # merely reproduce `b.size_content` and compare values of different widths.
        return cmp(a.size_content, ntoh(S(_swapped_str(b))) | S(sizeof(b)))
    end
end
224
264
225
265
# Promoting a `String` together with any `ShortString{S}` yields `String`.
function promote_rule(::Type{String}, ::Type{ShortString{S}}) where {S}
    return String
end
@@ -240,9 +280,9 @@ size_content(s::ShortString) = s.size_content
240
280
# Default storage types, ordered from smallest to largest.
const def_types = (
    UInt32,
    UInt64,
    UInt128,
    UInt256,
    UInt512,
    UInt1024,
    UInt2048,
)
241
281
242
282
for T in def_types
243
- max_len = sizeof (T) - size_bytes (T)
244
- constructor_name = Symbol (:ShortString , max_len )
245
- macro_name = Symbol (:ss , max_len , :_str )
283
+ maxlen = max_len (T)
284
+ constructor_name = Symbol (:ShortString , maxlen )
285
+ macro_name = Symbol (:ss , maxlen , :_str )
246
286
247
287
@eval const $ constructor_name = ShortString{$ T}
248
288
@eval macro $ (macro_name)(s)
@@ -257,17 +297,39 @@ which can be used to store the string
257
297
If no type is large enough, then an `ArgumentError` is thrown
258
298
"""
259
299
function get_type(maxlen; types=def_types)
    # Lengths below 1 are rejected up front.
    if maxlen < 1
        throw(ArgumentError("$maxlen is <= 0"))
    end
    # `types` is scanned in the order given; the first type with enough room wins.
    for candidate in types
        if maxlen <= max_len(candidate)
            return ShortString{candidate}
        end
    end
    throw(ArgumentError("$maxlen is too large to fit into any of the provided types: $types"))
end
265
306
307
"""
    ShortString(str, maxlen = max(sizeof(str), 1); types = def_types)

Create a ShortString, using the smallest ShortString that can fit the string, unless the second
argument `maxlen` is passed.
If the keyword argument `types` is passed with a list (a tuple or Vector) of Unsigned
types, in order of their size, then one of those types will be used.

The default `maxlen` is clamped to at least 1 so that an empty `str` selects the
smallest type instead of being rejected by `get_type`. An explicitly passed
`maxlen` is forwarded unchanged.

Throws an `ArgumentError` (from `get_type`) when no type in `types` is large enough.
"""
ShortString(str::Union{String,SubString{String}}, maxlen = max(sizeof(str), 1); types=def_types) =
    get_type(maxlen, types=types)(str)
268
315
269
- macro ss_str (str, max= " 0" )
270
- :( ShortString ($ str, $ (parse (Int, max))) )
316
"""
    ss"..."
    ss"..."b255

Create a ShortString, using the smallest ShortString that can fit the string,
unless it is optionally followed by a single ASCII character and a maximum length.
`ss"foo"b255` indicates that a ShortString that can contain 255 bytes should be used.

Throws an `ArgumentError` at macro expansion when the optional length cannot be
parsed as an integer or has an unsupported type.
"""
macro ss_str(str, max=nothing)
    if max === nothing
        # Clamp to 1 so that the empty literal `ss""` still selects the smallest
        # type. `Base.max` is spelled out because the parameter `max` shadows it.
        maxlen = Base.max(sizeof(str), 1)
    elseif max isa Integer
        maxlen = max
    elseif max isa String
        # Accept either a bare number or one leading character followed by a number.
        maxlen = tryparse(Int, isdigit(max[1]) ? max : max[2:end])
        maxlen === nothing && throw(ArgumentError("Optional length $max not a valid Integer"))
    else
        throw(ArgumentError("Unsupported type $(typeof(max)) for optional length $max"))
    end
    return :(ShortString($str, $maxlen))
end
272
334
273
335
fsort (v:: Vector{ShortString{T}} ; rev = false ) where {T} =
0 commit comments