From 8afd70d929f0f2f531997494c90fb0dd18ce7fc8 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 7 Jul 2016 15:30:00 -0400 Subject: [PATCH 1/9] export and document transcode from #16974, add transcode(String, x) and transcode(T, ::String) convenience methods --- base/c.jl | 16 +++++++++------- base/env.jl | 4 ++-- base/exports.jl | 1 + base/file.jl | 4 ++-- base/interactiveutil.jl | 2 +- base/libc.jl | 2 +- base/path.jl | 4 ++-- test/misc.jl | 8 +++++++- 8 files changed, 25 insertions(+), 16 deletions(-) diff --git a/base/c.jl b/base/c.jl index 97a1815b7d23c..ab643f211f646 100644 --- a/base/c.jl +++ b/base/c.jl @@ -130,18 +130,20 @@ end # transcoding between data in UTF-8 and UTF-16 for Windows APIs """ - Base.transcode(T,src::Vector{U}) + transcode(T, src) -Transcodes unicode data `src` to a different encoding, where `U` and `T` are the integers -denoting the input and output code units. Currently supported are UTF-8 and UTF-16, which -are denoted by integers `UInt8` and `UInt16`, respectively. - -NULs are handled like any other character (i.e. the output will be NUL-terminated if and -only if the `src` is). +Convert string data between Unicode encodings. `src` is either a +`String` or an `Vector{UIntXX}` of UTF-XX code units, where +`XX` is 8 or 16. `T` indicates the encoding of the return value: +`String` to return a (UTF-8 encoded) `String` or `UIntXX` +to return a `Vector{UIntXX}` of the UTF-`XX` data. """ function transcode end + transcode{T<:Union{UInt8,UInt16}}(::Type{T}, src::Vector{T}) = src transcode(::Type{Int32}, src::Vector{UInt32}) = reinterpret(Int32, src) +transcode(T, src::String) = transcode(T, src.data) +transcode(::Type{String}, src) = String(transcode(UInt8, src)) function transcode(::Type{UInt16}, src::Vector{UInt8}) dst = UInt16[] diff --git a/base/env.jl b/base/env.jl index cb21ecbe0dcf4..0f41a5bbf170e 100644 --- a/base/env.jl +++ b/base/env.jl @@ -19,7 +19,7 @@ function access_env(onError::Function, str::AbstractString) error(string("getenv: ", str, ' ', len, "-1 != ", ret, ": ", Libc.FormatMessage())) end pop!(val) # NUL - return String(transcode(UInt8, val)) + return transcode(String, val) end function _setenv(svar::AbstractString, sval::AbstractString, overwrite::Bool=true) @@ -97,7 +97,7 @@ function next(hash::EnvHash, block::Tuple{Ptr{UInt16},Ptr{UInt16}}) len = ccall(:wcslen, UInt, (Ptr{UInt16},), pos) buf = Array{UInt16}(len) unsafe_copy!(pointer(buf), pos, len) - env = String(transcode(UInt8, buf)) + env = transcode(String, buf) m = match(r"^(=?[^=]+)=(.*)$"s, env) if m === nothing error("malformed environment entry: $env") diff --git a/base/exports.jl b/base/exports.jl index 8641e82cd8fba..d3046082fdb84 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -874,6 +874,7 @@ export strip, strwidth, summary, + transcode, ucfirst, unescape_string, uppercase, diff --git a/base/file.jl b/base/file.jl index 97aa73a31aee3..396ba739f2cd4 100644 --- a/base/file.jl +++ b/base/file.jl @@ -203,7 +203,7 @@ function tempdir() error("GetTempPath failed: $(Libc.FormatMessage())") end resize!(temppath,lentemppath) - return String(transcode(UInt8, temppath)) + return transcode(String, temppath) end tempname(uunique::UInt32=UInt32(0)) = tempname(tempdir(), uunique) const temp_prefix = cwstring("jl_") @@ -216,7 +216,7 @@ function tempname(temppath::AbstractString,uunique::UInt32) error("GetTempFileName failed: $(Libc.FormatMessage())") end resize!(tname,lentname) - return String(transcode(UInt8, tname)) + return transcode(String, tname) end function mktemp(parent=tempdir()) filename = tempname(parent, UInt32(0)) diff --git a/base/interactiveutil.jl b/base/interactiveutil.jl index ce4274d2551a1..6ac54783f30e4 100644 --- a/base/interactiveutil.jl +++ b/base/interactiveutil.jl @@ -150,7 +150,7 @@ elseif is_windows() len = 0 while unsafe_load(plock, len+1) != 0; len += 1; end # get Vector{UInt16}, transcode data to UTF-8, make a String of it - s = String(transcode(UInt8, unsafe_wrap(Array, plock, len))) + s = transcode(String, unsafe_wrap(Array, plock, len)) systemerror(:GlobalUnlock, 0==ccall((:GlobalUnlock, "kernel32"), stdcall, Cint, (Ptr{UInt16},), plock)) return s end diff --git a/base/libc.jl b/base/libc.jl index 6943d457ebde7..8020147d42ce1 100644 --- a/base/libc.jl +++ b/base/libc.jl @@ -277,7 +277,7 @@ if is_windows() buf = Array{UInt16}(len) unsafe_copy!(pointer(buf), p, len) ccall(:LocalFree,stdcall,Ptr{Void},(Ptr{Void},),p) - return String(transcode(UInt8, buf)) + return transcode(String, buf) end end diff --git a/base/path.jl b/base/path.jl index 494250dc29683..d19779734913a 100644 --- a/base/path.jl +++ b/base/path.jl @@ -136,7 +136,7 @@ function realpath(path::AbstractString) systemerror(:realpath, n == 0) x = n < length(buf) # is the buffer big enough? resize!(buf, n) # shrink if x, grow if !x - x && return String(transcode(UInt8, buf)) + x && return transcode(String, buf) end end @@ -150,7 +150,7 @@ function longpath(path::AbstractString) systemerror(:longpath, n == 0) x = n < length(buf) # is the buffer big enough? resize!(buf, n) # shrink if x, grow if !x - x && return String(transcode(UInt8, buf)) + x && return transcode(String, buf) end end diff --git a/test/misc.jl b/test/misc.jl index e3ae6b72e8ce6..dea52228a7105 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -209,7 +209,6 @@ whos(IOBuffer(), Tmp14173) # warm up @test @allocated(whos(IOBuffer(), Tmp14173)) < 10000 ## test conversion from UTF-8 to UTF-16 (for Windows APIs) -import Base.Libc: transcode # empty arrays @test transcode(UInt16, UInt8[]) == UInt16[] @@ -376,6 +375,13 @@ for (X,Y,Z) in ((V16,V16,V16), (I16,V16,I16), (V16,I16,V16), (V16,V16,I16), (I16 end end +let s = "abcα🐨\0x\0" + for T in (UInt8, UInt16) + @test transcode(T, s) == transcode(T, s.data) + @test transcode(String, transcode(T, s)) == s + end +end + # clipboard functionality if is_windows() for str in ("Hello, world.", "∀ x ∃ y", "") From 7576748749cf9436b8aeec9e5956c6c739828bbd Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 7 Jul 2016 15:49:26 -0400 Subject: [PATCH 2/9] docs --- NEWS.md | 4 ++++ base/c.jl | 2 +- doc/manual/strings.rst | 8 +++++--- doc/stdlib/strings.rst | 8 ++++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8e5aca14d4b6d..90746ee7cc7f8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -131,6 +131,9 @@ Library improvements `String(s)`, `unsafe_string(ptr)` (formerly `bytestring(ptr)`), and `unsafe_wrap(String, ptr)` (formerly `pointer_to_string`) ([#16731]). + * A `transcode(T, src)` function is now exported for converting data + between UTF-xx Unicode encodings ([#17323]). + * Most of the combinatorics functions have been moved from `Base` to the [Combinatorics.jl package](https://github.com/JuliaLang/Combinatorics.jl) ([#13897]). @@ -321,4 +324,5 @@ Deprecated or removed [#17075]: https://github.com/JuliaLang/julia/issues/17075 [#17266]: https://github.com/JuliaLang/julia/issues/17266 [#17300]: https://github.com/JuliaLang/julia/issues/17300 +[#17323]: https://github.com/JuliaLang/julia/issues/17323 [#17374]: https://github.com/JuliaLang/julia/issues/17374 diff --git a/base/c.jl b/base/c.jl index ab643f211f646..bc5002ee76cfb 100644 --- a/base/c.jl +++ b/base/c.jl @@ -132,7 +132,7 @@ end """ transcode(T, src) -Convert string data between Unicode encodings. `src` is either a +Convert string data between Unicode encodings. `src` is either a `String` or an `Vector{UIntXX}` of UTF-XX code units, where `XX` is 8 or 16. `T` indicates the encoding of the return value: `String` to return a (UTF-8 encoded) `String` or `UIntXX` diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst index 4adec34080742..5628c1d415e1d 100644 --- a/doc/manual/strings.rst +++ b/doc/manual/strings.rst @@ -352,14 +352,16 @@ exception handling required: y -Julia uses UTF-8 encoding by default, and support for new encodings can +Julia uses the UTF-8 encoding by default, and support for new encodings can be added by packages. For example, the `LegacyStrings.jl `_ package implements ``UTF16String`` and ``UTF32String`` types. Additional discussion of other encodings and how to implement support for them is beyond the scope of this document for the time being. For further discussion of UTF-8 encoding issues, -see the section below on `byte array literals <#Byte+Array+Literals>`_, -which goes into some greater detail. +see the section below on `byte array literals <#Byte+Array+Literals>`_. +The :func:`transcode` function is provided to convert data between +the various UTF-xx encodings, primarily for working with external +data and libraries. .. _man-string-interpolation: diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index b27e7616a1e1e..cefc05ca1ec1a 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -56,6 +56,14 @@ Convert a string to a contiguous byte array representation encoded as UTF-8 bytes. This representation is often appropriate for passing strings to C. +.. function:: transcode(T, src) + + .. Docstring generated from Julia source + + Convert string data between Unicode encodings. ``src`` is either a ``String`` or an ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8 or 16. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of the UTF-``XX`` data. + + Additional string encodings (e.g. UTF-32) are supported by the ``LegacyStrings`` package. + .. function:: unsafe_string(p::Ptr{UInt8}, [length::Integer]) .. Docstring generated from Julia source From aa371bc6dcac8ddf904da801f91906339bdb396a Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 7 Jul 2016 16:27:04 -0400 Subject: [PATCH 3/9] support UTF-32 in transcode --- base/c.jl | 16 ++++++++++++---- doc/stdlib/strings.rst | 4 +--- test/misc.jl | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/base/c.jl b/base/c.jl index bc5002ee76cfb..b70a56bfedfac 100644 --- a/base/c.jl +++ b/base/c.jl @@ -128,23 +128,31 @@ function cwstring(s::AbstractString) end end -# transcoding between data in UTF-8 and UTF-16 for Windows APIs +# transcoding between data in UTF-8 and UTF-16 for Windows APIs, +# and also UTF-32 for APIs using Cwchar_t on other platforms. + """ transcode(T, src) Convert string data between Unicode encodings. `src` is either a `String` or an `Vector{UIntXX}` of UTF-XX code units, where -`XX` is 8 or 16. `T` indicates the encoding of the return value: +`XX` is 8, 16, or 32. `T` indicates the encoding of the return value: `String` to return a (UTF-8 encoded) `String` or `UIntXX` -to return a `Vector{UIntXX}` of the UTF-`XX` data. +to return a `Vector{UIntXX}` of UTF-`XX` data. """ function transcode end transcode{T<:Union{UInt8,UInt16}}(::Type{T}, src::Vector{T}) = src -transcode(::Type{Int32}, src::Vector{UInt32}) = reinterpret(Int32, src) transcode(T, src::String) = transcode(T, src.data) transcode(::Type{String}, src) = String(transcode(UInt8, src)) +transcode(::Type{Int32}, src::Vector{UInt32}) = reinterpret(Int32, src) +transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::String) = T[T(c) for c in src] +transcode{T<:Union{Int32,UInt32}}(::Type{T}, src) = transcode(T, transcode(String, src)) +transcode{S<:Union{Int32,UInt32}}(T, src::Vector{S}) = transcode(T, transcode(String, src)) +transcode{S<:Union{Int32,UInt32}}(::Type{String}, src::Vector{S}) = string(map(Char, src)...) +transcode(::Type{UInt16}, src::Vector) = transcode(UInt16, transcode(UInt8, src)) + function transcode(::Type{UInt16}, src::Vector{UInt8}) dst = UInt16[] i, n = 1, length(src) diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index cefc05ca1ec1a..b2b44803325f2 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -60,9 +60,7 @@ .. Docstring generated from Julia source - Convert string data between Unicode encodings. ``src`` is either a ``String`` or an ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8 or 16. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of the UTF-``XX`` data. - - Additional string encodings (e.g. UTF-32) are supported by the ``LegacyStrings`` package. + Convert string data between Unicode encodings. ``src`` is either a ``String`` or an ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. .. function:: unsafe_string(p::Ptr{UInt8}, [length::Integer]) diff --git a/test/misc.jl b/test/misc.jl index dea52228a7105..16066830498e9 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -376,7 +376,7 @@ for (X,Y,Z) in ((V16,V16,V16), (I16,V16,I16), (V16,I16,V16), (V16,V16,I16), (I16 end let s = "abcα🐨\0x\0" - for T in (UInt8, UInt16) + for T in (UInt8, UInt16, UInt32, Int32) @test transcode(T, s) == transcode(T, s.data) @test transcode(String, transcode(T, s)) == s end From 66f9155ca3d6db4559720f5f100030aaa683492d Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 7 Jul 2016 16:48:35 -0400 Subject: [PATCH 4/9] don't use splatting for UTF-32 to String conversion --- base/c.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/base/c.jl b/base/c.jl index b70a56bfedfac..1db5fbf762519 100644 --- a/base/c.jl +++ b/base/c.jl @@ -150,7 +150,11 @@ transcode(::Type{Int32}, src::Vector{UInt32}) = reinterpret(Int32, src) transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::String) = T[T(c) for c in src] transcode{T<:Union{Int32,UInt32}}(::Type{T}, src) = transcode(T, transcode(String, src)) transcode{S<:Union{Int32,UInt32}}(T, src::Vector{S}) = transcode(T, transcode(String, src)) -transcode{S<:Union{Int32,UInt32}}(::Type{String}, src::Vector{S}) = string(map(Char, src)...) +function transcode{S<:Union{Int32,UInt32}}(::Type{String}, src::Vector{S}) + buf = IOBuffer() + for c in src; print(buf, Char(c)); end + takebuf_string(buf) +end transcode(::Type{UInt16}, src::Vector) = transcode(UInt16, transcode(UInt8, src)) function transcode(::Type{UInt16}, src::Vector{UInt8}) From 35e7b697952de3c45279abb273206758cf258edf Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 7 Jul 2016 16:50:47 -0400 Subject: [PATCH 5/9] typo --- doc/stdlib/strings.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index b2b44803325f2..a9d95a595c1a3 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -60,7 +60,7 @@ .. Docstring generated from Julia source - Convert string data between Unicode encodings. ``src`` is either a ``String`` or an ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. + Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. .. function:: unsafe_string(p::Ptr{UInt8}, [length::Integer]) From fc74630924903f90fee40b1ac821bbf12c8a1118 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Fri, 8 Jul 2016 15:09:34 -0400 Subject: [PATCH 6/9] eliminate method ambiguities --- base/c.jl | 20 +++++++++----------- doc/stdlib/strings.rst | 2 +- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/base/c.jl b/base/c.jl index 1db5fbf762519..6e2c074ef2f39 100644 --- a/base/c.jl +++ b/base/c.jl @@ -138,24 +138,22 @@ Convert string data between Unicode encodings. `src` is either a `String` or an `Vector{UIntXX}` of UTF-XX code units, where `XX` is 8, 16, or 32. `T` indicates the encoding of the return value: `String` to return a (UTF-8 encoded) `String` or `UIntXX` -to return a `Vector{UIntXX}` of UTF-`XX` data. +to return a `Vector{UIntXX}` of UTF-`XX` data. Only conversion +to or from UTF-8 is currently supported. """ function transcode end -transcode{T<:Union{UInt8,UInt16}}(::Type{T}, src::Vector{T}) = src -transcode(T, src::String) = transcode(T, src.data) -transcode(::Type{String}, src) = String(transcode(UInt8, src)) - -transcode(::Type{Int32}, src::Vector{UInt32}) = reinterpret(Int32, src) +transcode{T<:Union{UInt8,UInt16,UInt32,Int32}}(::Type{T}, src::Vector{T}) = src transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::String) = T[T(c) for c in src] -transcode{T<:Union{Int32,UInt32}}(::Type{T}, src) = transcode(T, transcode(String, src)) -transcode{S<:Union{Int32,UInt32}}(T, src::Vector{S}) = transcode(T, transcode(String, src)) -function transcode{S<:Union{Int32,UInt32}}(::Type{String}, src::Vector{S}) +transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::Vector{UInt8}) = transcode(T, String(src)) +function transcode{S<:Union{Int32,UInt32}}(::Type{UInt8}, src::Vector{S}) buf = IOBuffer() for c in src; print(buf, Char(c)); end - takebuf_string(buf) + takebuf_array(buf) end -transcode(::Type{UInt16}, src::Vector) = transcode(UInt16, transcode(UInt8, src)) +transcode(::Type{String}, src::String) = src +transcode(T, src::String) = transcode(T, src.data) +transcode(::Type{String}, src) = String(transcode(UInt8, src)) function transcode(::Type{UInt16}, src::Vector{UInt8}) dst = UInt16[] diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index a9d95a595c1a3..636b3485dc097 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -60,7 +60,7 @@ .. Docstring generated from Julia source - Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. + Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. Only conversion to or from UTF-8 is currently supported. .. function:: unsafe_string(p::Ptr{UInt8}, [length::Integer]) From bef4d198ad17b261f5f6c825fa4fef1c3720d91b Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 9 Jul 2016 10:20:16 -0400 Subject: [PATCH 7/9] re-run genstdlib --- doc/stdlib/strings.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index 636b3485dc097..f0ae5751d5ab6 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -60,7 +60,7 @@ .. Docstring generated from Julia source - Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. Only conversion to or from UTF-8 is currently supported. + Convert string data between Unicode encodings. ``src`` is either a ``String`` or an ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. Only conversion to or from UTF-8 is currently supported. .. function:: unsafe_string(p::Ptr{UInt8}, [length::Integer]) From c49099d0d854eecb58e0279db4871e18ea248ec6 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Mon, 11 Jul 2016 12:19:55 -0400 Subject: [PATCH 8/9] doc clarification --- base/c.jl | 13 ++++++++++--- doc/stdlib/strings.rst | 6 +++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/base/c.jl b/base/c.jl index 6e2c074ef2f39..6c2f169377e9b 100644 --- a/base/c.jl +++ b/base/c.jl @@ -135,11 +135,18 @@ end transcode(T, src) Convert string data between Unicode encodings. `src` is either a -`String` or an `Vector{UIntXX}` of UTF-XX code units, where +`String` or a `Vector{UIntXX}` of UTF-XX code units, where `XX` is 8, 16, or 32. `T` indicates the encoding of the return value: `String` to return a (UTF-8 encoded) `String` or `UIntXX` -to return a `Vector{UIntXX}` of UTF-`XX` data. Only conversion -to or from UTF-8 is currently supported. +to return a `Vector{UIntXX}` of UTF-`XX` data. (The alias `Cwchar_t` +also be used as the integer type for converting `wchar_t*` strings +used by external C libraries.) + +The `transcode` function succeeds as long as the input data can be +reasonably represented in the target encoding; it always succeeds for +conversions between UTF-XX encodings, even for invalid Unicode data. + +Only conversion to/from UTF-8 is currently supported. """ function transcode end diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index f0ae5751d5ab6..e0d93fb9c66e8 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -60,7 +60,11 @@ .. Docstring generated from Julia source - Convert string data between Unicode encodings. ``src`` is either a ``String`` or an ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. Only conversion to or from UTF-8 is currently supported. + Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. (The alias ``Cwchar_t`` also be used as the integer type for converting ``wchar_t*`` strings used by external C libraries.) + + The ``transcode`` function succeeds as long as the input data can be reasonably represented in the target encoding; it always succeeds for conversions between UTF-XX encodings, even for invalid Unicode data. + + Only conversion to/from UTF-8 is currently supported. .. function:: unsafe_string(p::Ptr{UInt8}, [length::Integer]) From d10a1e84033ad26875de2c721f014b9a6289c6d1 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Mon, 11 Jul 2016 12:30:32 -0400 Subject: [PATCH 9/9] typo --- base/c.jl | 2 +- doc/stdlib/strings.rst | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/base/c.jl b/base/c.jl index 6c2f169377e9b..8bd9b85179b24 100644 --- a/base/c.jl +++ b/base/c.jl @@ -139,7 +139,7 @@ Convert string data between Unicode encodings. `src` is either a `XX` is 8, 16, or 32. `T` indicates the encoding of the return value: `String` to return a (UTF-8 encoded) `String` or `UIntXX` to return a `Vector{UIntXX}` of UTF-`XX` data. (The alias `Cwchar_t` -also be used as the integer type for converting `wchar_t*` strings +can also be used as the integer type, for converting `wchar_t*` strings used by external C libraries.) The `transcode` function succeeds as long as the input data can be diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index e0d93fb9c66e8..1e6cd2a46b5cb 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -60,7 +60,7 @@ .. Docstring generated from Julia source - Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. (The alias ``Cwchar_t`` also be used as the integer type for converting ``wchar_t*`` strings used by external C libraries.) + Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. (The alias ``Cwchar_t`` can also be used as the integer type, for converting ``wchar_t*`` strings used by external C libraries.) The ``transcode`` function succeeds as long as the input data can be reasonably represented in the target encoding; it always succeeds for conversions between UTF-XX encodings, even for invalid Unicode data. @@ -482,4 +482,3 @@ .. Docstring generated from Julia source General unescaping of traditional C and Unicode escape sequences. Reverse of :func:`escape_string`\ . See also :func:`unescape_string`\ . -