JuliaLang · StefanKarpinski · Nov 10, 2017 · Nov 3, 2017 · Nov 3, 2017 · Nov 9, 2017
diff --git a/base/char.jl b/base/char.jl
@@ -1,8 +1,58 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-convert(::Type{Char}, x::UInt32) = reinterpret(Char, x)
+struct MalformedCharError <: Exception  # exception type thrown by `malformed_char`
+    char::Char  # the character value handed to the constructor
+end
+struct CodePointError{T<:Integer} <: Exception  # parametric so that `code` has a concrete field type
+    code::T  # the integer value that was rejected, kept in the type it was passed as
+end
+@noinline malformed_char(c::Char) = throw(MalformedCharError(c))  # always throws; never returns
+@noinline code_point_err(u::UInt32) = throw(CodePointError(u))  # always throws; never returns
+
+function ismalformed(c::Char)  # true when the raw bits of `c` fail any structural check below
+    bits = reinterpret(UInt32, c)
+    lead = 8 * leading_ones(bits)    # count of leading 1-bits, times 8
+    pad = trailing_zeros(bits) & 56  # trailing zero bits, rounded down to a multiple of 8
+    strays = ((bits & 0x00c0c0c0) ⊻ 0x00808080) >> pad  # nonzero if a used lower byte is not 10xxxxxx
+    return (lead == 8) | (lead + pad > 32) | (strays != 0)
+end
+
+function convert(::Type{UInt32}, c::Char)  # decode the raw bytes of `c` into an integer; throws via malformed_char
+    # TODO: use optimized inline LLVM
+    u = reinterpret(UInt32, c)
+    u < 0x80000000 && return reinterpret(UInt32, u >> 24)  # top bit clear: the top byte is the value
+    l1 = leading_ones(u)
+    t0 = trailing_zeros(u) & 56  # trailing zero bits, rounded down to a multiple of 8
+    (l1 == 1) | (8l1 + t0 > 32) |
+    (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) &&
+        malformed_char(c)::Union{}  # same three structural checks as `ismalformed`
+    u &= 0xffffffff >> l1  # clear the l1 leading 1-bits
+    u >>= t0               # drop the unused trailing zero bytes
+    ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) |
+    ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)  # mask first, then shift; parenthesized so `&`/`>>` precedence cannot change it
+end
+
+function convert(::Type{Char}, u::UInt32)  # build the raw Char bit pattern for integer value `u`
+    u < 0x80 && return reinterpret(Char, u << 24)  # below 0x80: the value itself goes in the top byte
+    u < 0x00200000 || code_point_err(u)::Union{}  # values of 0x00200000 and above are rejected
+    c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) |
+        ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000)  # spread `u` into 6-bit groups, one per byte
+    c = u < 0x00000800 ? (c << 16) | 0xc0800000 :
+        u < 0x00010000 ? (c << 08) | 0xe0808000 :
+                         (c << 00) | 0xf0808080  # left-align by size class and OR in the per-byte tag bits
+    reinterpret(Char, c)
+end
+
+function convert(::Type{T}, c::Char) where T <: Union{Int8,UInt8}
+    raw = reinterpret(Int32, c)  # negative exactly when the top bit of the representation is set
+    raw < 0 ? T(UInt32(c)) : ((raw >>> 24) % T)  # top bit clear: the top byte is the value
+end
+
+function convert(::Type{Char}, b::Union{Int8,UInt8})  # values in 0:0x7f are placed directly in the top byte
+    (b < 0) | (b > 0x7f) ? Char(UInt32(b)) : reinterpret(Char, (b % UInt32) << 24)
+end
+
 convert(::Type{Char}, x::Number) = Char(UInt32(x))
-convert(::Type{UInt32}, x::Char) = reinterpret(UInt32, x)
 convert(::Type{T}, x::Char) where {T<:Number} = convert(T, UInt32(x))
 
 rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T)
@@ -29,11 +79,9 @@ done(c::Char, state) = state
 isempty(c::Char) = false
 in(x::Char, y::Char) = x == y
 
-==(x::Char, y::Char) = UInt32(x) == UInt32(y)
-isless(x::Char, y::Char) = UInt32(x) < UInt32(y)
-
-const hashchar_seed = 0xd4d64234
-hash(x::Char, h::UInt) = hash_uint64(((UInt64(x)+hashchar_seed)<<32) ⊻ UInt64(h))
+==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)  # equality of the raw 32-bit patterns
+isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)  # unsigned order of the raw patterns
+hash(x::Char, h::UInt) = hash(reinterpret(UInt32, x), hash(Char, h))  # seed is mixed with the Char type first
 
 -(x::Char, y::Char) = Int(x) - Int(y)
 -(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
@@ -66,21 +114,37 @@ function show(io::IO, c::Char)
     end
     if isprint(c)
         write(io, 0x27, c, 0x27)
-    else
+    elseif !ismalformed(c)
         u = UInt32(c)
         write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
         d = max(2, 8 - (leading_zeros(u) >> 2))
         while 0 < d
             write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
         end
         write(io, 0x27)
+    else # malformed
+        write(io, 0x27)
+        u = reinterpret(UInt32, c)
+        while true
+            a = hex_chars[((u >> 28) & 0xf) + 1]
+            b = hex_chars[((u >> 24) & 0xf) + 1]
+            write(io, 0x5c, 'x', a, b)
+            (u <<= 8) == 0 && break
+        end
+        write(io, 0x27)
     end
     return
 end
 
 function show(io::IO, ::MIME"text/plain", c::Char)
     show(io, c)
-    u = UInt32(c)
-    print(io, ": ", isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
-    print(io, " (category ", UTF8proc.category_abbrev(c), ": ", UTF8proc.category_string(c), ")")
+    if !ismalformed(c)  # only well-formed chars are converted to an integer for the U+ label
+        u = UInt32(c)
+        print(io, ": ", isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
+    else  # malformed: print a fixed label instead of a code point
+        print(io, ": Malformed UTF-8")
+    end
+    abr = UTF8proc.category_abbrev(c)  # the category is looked up and printed in both branches
+    str = UTF8proc.category_string(c)
+    print(io, " (category ", abr, ": ", str, ")")
 end
diff --git a/base/filesystem.jl b/base/filesystem.jl
@@ -149,6 +149,26 @@ function read(f::File, ::Type{UInt8})
     return ret % UInt8
 end
 
+function read(f::File, ::Type{Char})  # read a lead byte plus the continuation bytes it announces
+    b0 = read(f, UInt8)
+    l = 8(4-leading_ones(b0))  # lowest shift to fill: 16, 8 or 0 for 2, 3 or 4 leading ones; negative for 5+
+    c = UInt32(b0) << 24
+    if 0 ≤ l < 24  # only 2-4 byte leads take continuations; 0xf8-0xff no longer swallow bytes that cannot be stored
+        s = 16
+        while s ≥ l && !eof(f)
+            p = position(f)
+            b = read(f, UInt8)
+            if b & 0xc0 != 0x80  # not a 10xxxxxx byte: seek back so it is not consumed
+                seek(f, p)
+                break
+            end
+            c |= UInt32(b) << s
+            s -= 8
+        end
+    end
+    return reinterpret(Char, c)
+end
+
 function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt)
     check_open(f)
     ret = ccall(:jl_fs_read, Int32, (Int32, Ptr{Void}, Csize_t),

diff --git a/base/intfuncs.jl b/base/intfuncs.jl
@@ -654,8 +654,8 @@ for sym in (:bin, :oct, :dec, :hex)
     @eval begin
         ($sym)(x::Unsigned, p::Int) = ($sym)(x,p,false)
         ($sym)(x::Unsigned)         = ($sym)(x,1,false)
-        ($sym)(x::Char, p::Int)     = ($sym)(unsigned(x),p,false)
-        ($sym)(x::Char)             = ($sym)(unsigned(x),1,false)
+        ($sym)(x::Char, p::Int)     = ($sym)(UInt32(x),p,false)
+        ($sym)(x::Char)             = ($sym)(UInt32(x),1,false)
         ($sym)(x::Integer, p::Int)  = ($sym)(unsigned(abs(x)),p,x<0)
         ($sym)(x::Integer)          = ($sym)(unsigned(abs(x)),1,x<0)
     end

diff --git a/base/io.jl b/base/io.jl
@@ -432,25 +432,13 @@ function write(s::IO, a::SubArray{T,N,<:Array}) where {T,N}
     end
 end
 
-
-function write(s::IO, ch::Char)
-    c = reinterpret(UInt32, ch)
-    if c < 0x80
-        return write(s, c%UInt8)
-    elseif c < 0x800
-        return (write(s, (( c >> 6          ) | 0xC0)%UInt8)) +
-               (write(s, (( c        & 0x3F ) | 0x80)%UInt8))
-    elseif c < 0x10000
-        return (write(s, (( c >> 12         ) | 0xE0)%UInt8)) +
-               (write(s, (((c >> 6)  & 0x3F ) | 0x80)%UInt8)) +
-               (write(s, (( c        & 0x3F ) | 0x80)%UInt8))
-    elseif c < 0x110000
-        return (write(s, (( c >> 18         ) | 0xF0)%UInt8)) +
-               (write(s, (((c >> 12) & 0x3F ) | 0x80)%UInt8)) +
-               (write(s, (((c >> 6)  & 0x3F ) | 0x80)%UInt8)) +
-               (write(s, (( c        & 0x3F ) | 0x80)%UInt8))
-    else
-        return write(s, '\ufffd')
+function write(io::IO, c::Char)  # emit the bytes of `c` starting from its top byte; returns how many were written
+    u = bswap(reinterpret(UInt32, c))  # byte-swap so the top byte becomes the low byte, emitted first
+    n = 1
+    while true
+        write(io, u % UInt8)
+        (u >>= 8) == 0 && return n  # stop as soon as all remaining bytes are zero
+        n += 1
     end
 end
 
@@ -493,31 +481,28 @@ function read!(s::IO, a::Array{T}) where T
     return a
 end
 
-function read(s::IO, ::Type{Char})
-    ch = read(s, UInt8)
-    if ch < 0x80
-        return Char(ch)
-    end
-
-    # mimic utf8.next function
-    trailing = Base.utf8_trailing[ch+1]
-    c::UInt32 = 0
-    for j = 1:trailing
-        c += ch
-        c <<= 6
-        ch = read(s, UInt8)
+function read(io::IO, ::Type{Char})  # read a lead byte plus the continuation bytes it announces
+    b0 = read(io, UInt8)
+    l = 8(4-leading_ones(b0))  # lowest shift to fill: 16, 8 or 0 for 2, 3 or 4 leading ones; negative for 5+
+    c = UInt32(b0) << 24
+    if 0 ≤ l < 24  # only 2-4 byte leads take continuations; 0xf8-0xff no longer swallow bytes that cannot be stored
+        s = 16
+        while s ≥ l && !eof(io)
+            peek(io) & 0xc0 == 0x80 || break  # leave a non-continuation byte unread
+            b = read(io, UInt8)
+            c |= UInt32(b) << s
+            s -= 8
+        end
     end
-    c += ch
-    c -= Base.utf8_offset[trailing+1]
-    return Char(c)
+    return reinterpret(Char, c)
 end
 
 # readuntil_string is useful below since it has
 # an optimized method for s::IOStream
 readuntil_string(s::IO, delim::UInt8) = String(readuntil(s, delim))
 
 function readuntil(s::IO, delim::Char)
-    if delim < Char(0x80)
+    if delim ≤ '\x7f'
         return readuntil_string(s, delim % UInt8)
     end
     out = IOBuffer()
@@ -598,7 +583,7 @@ function readuntil(io::IO, target::AbstractString)
     i = start(target)
     done(target, i) && return ""
     c, i = next(target, start(target))
-    if done(target, i) && c < Char(0x80)
+    if done(target, i) && c <= '\x7f'
         return readuntil_string(io, c % UInt8)
     end
     # decide how we can index target
@@ -625,14 +610,13 @@ function readuntil(io::IO, target::AbstractVector{T}) where T
     return out
 end
 
-
 """
     readchomp(x)
 
-Read the entirety of `x` as a string and remove a single trailing newline.
-Equivalent to `chomp!(read(x, String))`.
+Read the entirety of `x` as a string and remove a single trailing newline
+if there is one. Equivalent to `chomp(read(x, String))`.
 """
-readchomp(x) = chomp!(read(x, String))
+readchomp(x) = chomp(read(x, String))
 
 # read up to nb bytes into nb, returning # bytes read
 

diff --git a/base/iostream.jl b/base/iostream.jl
@@ -315,12 +315,13 @@ end
 
 ## low-level calls ##
 
-write(s::IOStream, b::UInt8) = Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios))
+function write(s::IOStream, b::UInt8)  # write one byte via ios_putc and return its result as an Int
+    !iswritable(s) && throw(ArgumentError("write failed, IOStream is not writeable"))
+    return Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios))
+end
 
 function unsafe_write(s::IOStream, p::Ptr{UInt8}, nb::UInt)
-    if !iswritable(s)
-        throw(ArgumentError("write failed, IOStream is not writeable"))
-    end
+    iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable"))  # reject non-writable streams before calling ios_write
     return Int(ccall(:ios_write, Csize_t, (Ptr{Void}, Ptr{Void}, Csize_t), s.ios, p, nb))
 end
 
@@ -353,14 +354,6 @@ end
 
 ## text I/O ##
 
-function write(s::IOStream, c::Char)
-    if !iswritable(s)
-        throw(ArgumentError("write failed, IOStream is not writeable"))
-    end
-    Int(ccall(:ios_pututf8, Cint, (Ptr{Void}, UInt32), s.ios, c))
-end
-read(s::IOStream, ::Type{Char}) = Char(ccall(:jl_getutf8, UInt32, (Ptr{Void},), s.ios))
-
 take!(s::IOStream) =
     ccall(:jl_take_buffer, Vector{UInt8}, (Ptr{Void},), s.ios)
 
@@ -452,14 +445,23 @@ function read(s::IOStream, nb::Integer; all::Bool=true)
 end
 
 ## Character streams ##
-const _chtmp = Ref{Char}()
+
 function peekchar(s::IOStream)
-    if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{Char}), s, _chtmp) < 0
+    chref = Ref{UInt32}()  # per-call output slot passed to ios_peekutf8
+    if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{UInt32}), s, chref) < 0  # negative result: return the sentinel below
         return typemax(Char)
     end
-    return _chtmp[]
+    return Char(chref[])  # presumably a code point written by the C call; confirm against ios_peekutf8
 end
 
 function peek(s::IOStream)
     ccall(:ios_peekc, Cint, (Ptr{Void},), s)
 end
+
+function peek(s::IO)  # read one byte from `s`, then reset the stream to the mark set beforehand
+    mark(s)
+    try read(s, UInt8)  # value of the try block is the function's result
+    finally
+        reset(s)  # runs whether or not the read threw
+    end
+end
diff --git a/base/parse.jl b/base/parse.jl
@@ -224,12 +224,12 @@ end
 ## string to float functions ##
 
 tryparse(::Type{Float64}, s::String) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
-tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
+tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
 tryparse_internal(::Type{Float64}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
 tryparse_internal(::Type{Float64}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)
 
 tryparse(::Type{Float32}, s::String) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
-tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
+tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
 tryparse_internal(::Type{Float32}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
 tryparse_internal(::Type{Float32}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)
 

diff --git a/base/regex.jl b/base/regex.jl
@@ -303,8 +303,12 @@ struct SubstitutionString{T<:AbstractString} <: AbstractString
     string::T
 end
 
-endof(s::SubstitutionString) = endof(s.string)
-next(s::SubstitutionString, idx::Int) = next(s.string, idx)
+ncodeunits(s::SubstitutionString) = ncodeunits(s.string)  # each of these five methods forwards to the wrapped `s.string`
+codeunit(s::SubstitutionString) = codeunit(s.string)
+codeunit(s::SubstitutionString, i::Integer) = codeunit(s.string, i)
+isvalid(s::SubstitutionString, i::Integer) = isvalid(s.string, i)
+next(s::SubstitutionString, i::Integer) = next(s.string, i)
+
 function show(io::IO, s::SubstitutionString)
     print(io, "s")
     show(io, s.string)

diff --git a/base/repl/REPLCompletions.jl b/base/repl/REPLCompletions.jl
@@ -106,7 +106,7 @@ const sorted_keywords = [
     "primitive type", "quote", "return", "struct",
     "true", "try", "using", "while"]
 
-function complete_keyword(s::String)
+function complete_keyword(s::Union{String,SubString{String}})
     r = searchsorted(sorted_keywords, s)
     i = first(r)
     n = length(sorted_keywords)

diff --git a/base/stream.jl b/base/stream.jl
@@ -1148,6 +1148,14 @@ unmark(x::LibuvStream)   = unmark(x.buffer)
 reset(x::LibuvStream)    = reset(x.buffer)
 ismarked(x::LibuvStream) = ismarked(x.buffer)
 
+function peek(s::LibuvStream)  # read one byte from `s`, then reset the stream to the mark set beforehand
+    mark(s)
+    try read(s, UInt8)  # value of the try block is the function's result
+    finally
+        reset(s)  # runs whether or not the read threw
+    end
+end
+
 # BufferStream's are non-OS streams, backed by a regular IOBuffer
 mutable struct BufferStream <: LibuvStream
     buffer::IOBuffer