Skip to content

Commit ad06687

Browse files
Make S"\xff" throw an error like bare "\xff" does.
We're getting very close to parity between how the parser interprets strings and how the Julia string implementation handles strings. It would be nice if, instead of this:

    julia> S"\xff"
    invalid UTF-8 sequence
    syntax error: error expanding macro S_str

the error message thrown by S_str were printed after the "syntax error:" prefix.
1 parent 874b7b4 commit ad06687

File tree

4 files changed

+14
-6
lines changed

4 files changed

+14
-6
lines changed

j/string.j

+10-3
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,13 @@ end
417417

418418
unescape_string(s::String) = print_to_string(print_unescaped, s)
419419

420+
## checking UTF-8 validity ##
421+
422+
is_valid_utf8(s::ByteString) = is_valid_utf8(s.data)
423+
is_valid_utf8(a::Array{Uint8,1}) =
424+
bool(ccall(:u8_isvalid, Int32, (Ptr{Uint8}, Int32), a, length(a)))
425+
check_utf8(s::ByteString) = is_valid_utf8(s) ? s : error("invalid UTF-8 sequence")
426+
420427
## string interpolation parsing ##
421428

422429
function interp_parse(str::String, unescape::Function)
@@ -450,14 +457,14 @@ function interp_parse(str::String, unescape::Function)
450457
!isa(strs[1],String) ? expr(:call,:string,strs[1]) : strs[1]
451458
end
452459

453-
interp_parse(str::String) = interp_parse(str, unescape_string)
460+
interp_parse(str::String) = interp_parse(str, s->check_utf8(unescape_string(s)))
454461

455462
## core string macros ##
456463

457464
macro str(s); interp_parse(s); end
458465
macro S_str(s); interp_parse(s); end
459-
macro I_str(s); interp_parse(s, unbackslash); end
460-
macro E_str(s); unescape_string(s); end
466+
macro I_str(s); interp_parse(s, s->unbackslash(s)); end
467+
macro E_str(s); check_utf8(unescape_string(s)); end
461468

462469
## shell-like command parsing ##
463470

src/julia-parser.scm

+2-2
Original file line numberDiff line numberDiff line change
@@ -916,7 +916,7 @@
916916
(loop (read-char (ts:port s))))))
917917
(let ((str (unescape-string (io.tostring! b))))
918918
(if (= (length str) 1)
919-
;; one byte, e.g. '\xff'. maybe not valid utf-8, but we
919+
;; one byte, e.g. '\xff'. maybe not valid UTF-8, but we
920920
;; want to use the raw value as a codepoint in this case.
921921
(wchar (aref str 0))
922922
(if (or (not (= (string-length str) 1))
@@ -1020,7 +1020,7 @@
10201020
`(macrocall str ,(car ps))
10211021
(let ((str (unescape-string (car ps))))
10221022
(if (not (string.isutf8 str))
1023-
(error "invalid utf-8 sequence"))
1023+
(error "invalid UTF-8 sequence"))
10241024
str))))
10251025

10261026
;; macro call

src/julia.expmap

+1
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
ios_getc;
8282
ios_getutf8;
8383
jl_getutf8;
84+
u8_isvalid;
8485
ios_close;
8586
ios_flush;
8687
ios_read;

src/support/utf8.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ size_t u8_vprintf(const char *fmt, va_list ap);
119119
size_t u8_printf(const char *fmt, ...);
120120

121121
/* determine whether a sequence of bytes is valid UTF-8. length is in bytes */
122-
int u8_isvalid(const char *str, int length);
122+
DLLEXPORT int u8_isvalid(const char *str, int length);
123123

124124
/* reverse a UTF-8 string. len is length in bytes. dest and src must both
125125
be allocated to at least len+1 bytes. returns 1 for error, 0 otherwise */

0 commit comments

Comments
 (0)