Skip to content

Commit 83bd7b1

Browse files
Use ASCIIString instead of Latin1String (closes #4).
Still to do: implement the optimizations this change enables, and make sure every string is printed in a form that, when read back in at the REPL, yields a string equal to the original. Also fixes the UTF-8 test "suite" (fixes #9).
1 parent 23eaf3a commit 83bd7b1

17 files changed

+62
-43
lines changed

Makefile

+4-2
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ pcre_h.j:
2424
test: debug
2525
./julia tests.j
2626

27-
testall: test
27+
test-utf8:
2828
./julia test_utf8.j
2929

30+
testall: test test-utf8
31+
3032
SLOCCOUNT = sloccount \
3133
--addlang makefile \
3234
--personcost 100000 \
@@ -51,4 +53,4 @@ clean:
5153
cleanall: clean
5254
$(MAKE) -C src cleanother
5355

54-
.PHONY: default debug release julia-debug julia-release test testall sloccount clean cleanall
56+
# Name every phony target explicitly. .PHONY has no pattern support: "test-*"
# is an ordinary filename glob, so it only matches files that already exist.
.PHONY: default debug release julia-debug julia-release test test-utf8 testall sloccount clean cleanall

ascii.j

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
## from src/boot.j
2+
# type ASCIIString <: String; data::Array{Uint8,1}; end
3+
4+
next(s::ASCIIString, i::Index) = (char(s.data[i]), i+1)
5+
6+
## overload methods for efficiency ##
7+
8+
length(s::ASCIIString) = length(s.data)
9+
cmp(a::ASCIIString, b::ASCIIString) = lexcmp(a.data, b.data)
10+
ind2chr(s::ASCIIString, i::Int) = i
11+
chr2ind(s::ASCIIString, i::Int) = i
12+
# Find character c in s. Only code points below 0x80 are searched for (via
# memchr over the raw bytes); any other c raises "char not found" without scanning.
strchr(s::ASCIIString, c::Char) = c < 0x80 ? memchr(s.data, c) : error("char not found")
13+
nextind(s::ASCIIString, i::Int) = i
14+
prevind(s::ASCIIString, i::Int) = i-1
15+
strcat(s::ASCIIString, t::ASCIIString, x::ASCIIString...) = ASCIIString(strdatacat(s, t, x...))
16+
17+
## outputting ASCII strings ##
18+
19+
print(s::ASCIIString) = print(s.data)
20+
write(io, s::ASCIIString) = write(io, s.data)
21+
22+
## transcoding to ASCII ##
23+
24+
ascii(s::ASCIIString) = s
25+
function ascii(s::String)
26+
f = c -> (c < 0x80) ? uint8(c) : error("invalid ASCII code point: U+$(hex(c))")
27+
ASCIIString(map(f, chars(s)))
28+
end

expr.j

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
## symbols ##
22

3-
symbol(s::Latin1String) = symbol(s.data)
3+
symbol(s::ASCIIString) = symbol(s.data)
44
symbol(s::UTF8String) = symbol(s.data)
55
symbol(a::Array{Uint8,1}) =
66
ccall(:jl_symbol_n, Any, (Ptr{Uint8}, Int32), a, int32(length(a)))::Symbol

latin1.j

+4-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
## from boot.j:
2-
# type Latin1String <: String
3-
# data::Array{Uint8,1}
4-
# end
1+
type Latin1String <: String
2+
data::Array{Uint8,1}
3+
end
54

65
next(s::Latin1String, i::Index) = (char(s.data[i]), i+1)
76

@@ -27,7 +26,6 @@ write(io, s::Latin1String) = write(io, s.data)
2726

2827
latin1(s::Latin1String) = s
2928
function latin1(s::String)
30-
f = c -> (c <= 0xff) ? uint8(c) :
31-
error("invalid Latin-1 code point: U+$(hex(c))")
29+
f = c -> (c <= 0xff) ? uint8(c) : error("invalid Latin-1 code point: U+$(hex(c))")
3230
Latin1String(map(f, chars(s)))
3331
end

multi.j

+1-1
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ function identify_socket(otherid, fd, sock)
194194
@assert i < PGRP.myid
195195
PGRP.workers[i] = Worker(locs[i].host, locs[i].port, fd, sock)
196196
PGRP.workers[i].id = i
197-
#write(stdout_stream, latin1("$(PGRP.myid) heard from $i\n"))
197+
#write(stdout_stream, "$(PGRP.myid) heard from $i\n")
198198
()
199199
end
200200

src/alloc.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jl_type_t *jl_array_uint8_type;
3232
jl_type_t *jl_array_any_type;
3333
jl_struct_type_t *jl_weakref_type;
3434
jl_tag_type_t *jl_string_type;
35-
jl_struct_type_t *jl_latin1_string_type;
35+
jl_struct_type_t *jl_ascii_string_type;
3636
jl_struct_type_t *jl_utf8_string_type;
3737
jl_struct_type_t *jl_expr_type;
3838
jl_bits_type_t *jl_intrinsic_type;

src/array.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,8 @@ jl_value_t *jl_pchar_to_string(char *str, size_t len)
195195
{
196196
jl_array_t *a = jl_pchar_to_array(str, len);
197197
JL_GC_PUSH(&a);
198-
jl_struct_type_t* string_type = u8_isvalid(a->data, len) < 2 ?
199-
jl_latin1_string_type : jl_utf8_string_type;
198+
jl_struct_type_t* string_type = u8_isvalid(a->data, len) == 1 ? // ASCII
199+
jl_ascii_string_type : jl_utf8_string_type;
200200
jl_value_t *s = jl_apply((jl_function_t*)string_type, (jl_value_t**)&a, 1);
201201
JL_GC_POP();
202202
return s;

src/boot.j

+3-8
Original file line numberDiff line numberDiff line change
@@ -117,15 +117,10 @@ isequal(w, v::WeakRef) = isequal(w, v.value)
117117

118118
abstract String
119119

120-
type Latin1String <: String
121-
data::Array{Uint8,1}
122-
end
123-
124-
type UTF8String <: String
125-
data::Array{Uint8,1}
126-
end
120+
type ASCIIString <: String; data::Array{Uint8,1}; end
121+
type UTF8String <: String; data::Array{Uint8,1}; end
127122

128-
typealias ByteString Union(Latin1String,UTF8String)
123+
typealias ByteString Union(ASCIIString,UTF8String)
129124

130125
abstract Exception
131126

src/dump.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ void jl_save_system_image(char *fname, char *startscriptname)
785785
jl_serialize_value(&f, jl_float64_type);
786786
jl_serialize_value(&f, jl_weakref_type);
787787
jl_serialize_value(&f, jl_string_type);
788-
jl_serialize_value(&f, jl_latin1_string_type);
788+
jl_serialize_value(&f, jl_ascii_string_type);
789789
jl_serialize_value(&f, jl_utf8_string_type);
790790
jl_serialize_value(&f, jl_errorexception_type);
791791
jl_serialize_value(&f, jl_typeerror_type);
@@ -852,7 +852,7 @@ void jl_restore_system_image(char *fname)
852852
jl_weakref_type->env = NULL;
853853
jl_weakref_type->linfo = NULL;
854854
jl_string_type = (jl_tag_type_t*)jl_deserialize_value(&f);
855-
jl_latin1_string_type = (jl_struct_type_t*)jl_deserialize_value(&f);
855+
jl_ascii_string_type = (jl_struct_type_t*)jl_deserialize_value(&f);
856856
jl_utf8_string_type = (jl_struct_type_t*)jl_deserialize_value(&f);
857857
jl_errorexception_type = (jl_struct_type_t*)jl_deserialize_value(&f);
858858
jl_typeerror_type = (jl_struct_type_t*)jl_deserialize_value(&f);

src/init.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ void jl_get_builtin_hooks()
251251
jl_weakref_type->env = NULL;
252252
jl_weakref_type->linfo = NULL;
253253
jl_string_type = (jl_tag_type_t*)global("String");
254-
jl_latin1_string_type = (jl_struct_type_t*)global("Latin1String");
254+
jl_ascii_string_type = (jl_struct_type_t*)global("ASCIIString");
255255
jl_utf8_string_type = (jl_struct_type_t*)global("UTF8String");
256256
jl_errorexception_type = (jl_struct_type_t*)global("ErrorException");
257257
jl_typeerror_type = (jl_struct_type_t*)global("TypeError");

src/julia.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ extern jl_struct_type_t *jl_array_type;
263263
extern jl_typename_t *jl_array_typename;
264264
extern jl_struct_type_t *jl_weakref_type;
265265
extern jl_tag_type_t *jl_string_type;
266-
extern jl_struct_type_t *jl_latin1_string_type;
266+
extern jl_struct_type_t *jl_ascii_string_type;
267267
extern jl_struct_type_t *jl_utf8_string_type;
268268
extern jl_struct_type_t *jl_errorexception_type;
269269
extern jl_struct_type_t *jl_typeerror_type;
@@ -401,9 +401,9 @@ void *allocb_permanent(size_t sz);
401401
#define jl_is_task(v) jl_typeis(v,jl_task_type)
402402
#define jl_is_func(v) (jl_is_func_type(jl_typeof(v)) || jl_is_struct_type(v))
403403
#define jl_is_function(v) jl_is_func(v)
404-
#define jl_is_latin1_string(v) jl_typeis(v,jl_latin1_string_type)
404+
#define jl_is_ascii_string(v) jl_typeis(v,jl_ascii_string_type)
405405
#define jl_is_utf8_string(v) jl_typeis(v,jl_utf8_string_type)
406-
#define jl_is_byte_string(v) (jl_is_latin1_string(v) || jl_is_utf8_string(v))
406+
#define jl_is_byte_string(v) (jl_is_ascii_string(v) || jl_is_utf8_string(v))
407407
#define jl_is_string(v) jl_subtype(v,(jl_value_t*)jl_string_type,1)
408408
#define jl_is_cpointer(v) jl_is_cpointer_type(jl_typeof(v))
409409
#define jl_is_pointer(v) jl_is_cpointer_type(jl_typeof(v))

start.j

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ set_current_output_stream(stdout_stream)
3636
stdin_stream = fdio(ccall(:jl_stdin, Int32, ()))
3737
stderr_stream = fdio(ccall(:jl_stderr, Int32, ()))
3838
load("string.j")
39-
load("latin1.j")
39+
load("ascii.j")
4040
load("utf8.j")
4141
load("show.j")
4242
load("regex.j")

string.j

+4-2
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,9 @@ function print_escaped(s::String, q::Bool, xmax::Char)
340340
if q; print('"'); end
341341
end
342342

343-
print_escaped(s::Latin1String, q) = print_escaped(s, q, '\xff')
343+
# TODO: make sure ASCII, Latin-1 and UTF-8 strings all get
344+
# printed so that when input back they are equivalent.
345+
344346
print_escaped(s::String, q) = print_escaped(s, q, '\x7f')
345347
print_escaped(s::String) = print_escaped(s, false)
346348
print_quoted (s::String) = print_escaped(s, true)
@@ -708,7 +710,7 @@ function uint2str(n::Int, b::Int)
708710
ccall(:uint2str, Ptr{Uint8},
709711
(Ptr{Uint8}, Ulong, Uint64, Uint32),
710712
data, ulong(sz), uint64(n), uint32(b))
711-
Latin1String(data[1:(sz-1)]) # cut out terminating NUL
713+
ASCIIString(data[1:(sz-1)]) # cut out terminating NUL
712714
end
713715

714716
uint2str(n::Int, b::Int, len::Int) = lpad(uint2str(n,b),len,'0')

sysimg.j

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ load("io.j")
3333
ccall(:jl_set_memio_func, Void, ())
3434
set_current_output_stream(make_stdout_stream()) # for error reporting
3535
load("string.j")
36-
load("latin1.j")
36+
load("ascii.j")
3737
load("utf8.j")
3838
load("show.j")
3939

table.j

+1-5
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,7 @@ function hash(a::Array)
8181
h
8282
end
8383

84-
# TODO: should we distinguish a UTF8String and
85-
# a Latin1String containing the same exact data?
86-
87-
hash(s::Union(UTF8String,Latin1String)) =
88-
ccall(:memhash32, Uint32, (Ptr{Void}, Size), s.data, length(s.data))
84+
# Hash a byte string from its raw data only: the string's type is not mixed in,
# so an ASCIIString and a UTF8String holding identical bytes hash to the same value.
hash(s::ByteString) = ccall(:memhash32, Uint32, (Ptr{Void}, Size), s.data, length(s.data))
8985

9086
# hash table
9187

test_utf8.j

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
utf32 = CharString(read(open("unicode/UTF-32LE.txt"), Char, 1112065)[2:]);
2-
utf8 = UTF8String(read(open("unicode/UTF-8.txt"), Uint8, 4382595)[4:]);
3-
@assert utf32 == utf8
1+
str1 = CharString(read(open("unicode/UTF-32LE.txt"), Char, 1112065)[2:]);
2+
str2 = UTF8String(read(open("unicode/UTF-8.txt"), Uint8, 4382595)[4:]);
3+
@assert str1 == str2
44

55
str1 = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε"
66
str2 = CharString(

utf8.j

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
## from boot.j:
2-
# type UTF8String <: String
3-
# data::Array{Uint8,1}
4-
# end
1+
## from src/boot.j:
2+
# type UTF8String <: String; data::Array{Uint8,1}; end
53

64
## basic UTF-8 decoding & iteration ##
75

0 commit comments

Comments
 (0)