Skip to content

Commit 18fd160

Browse files
committed
Introduce Encoding parametric singleton type
First step towards efficient encoders for common encodings, as well as towards providing information about encodings. This also allows adding convenience methods to base I/O functions taking an additional encoding parameter without risking ambiguities.
1 parent 5f601c2 commit 18fd160

File tree

3 files changed

+448
-305
lines changed

3 files changed

+448
-305
lines changed

src/StringEncodings.jl

+44-30
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# This file is a part of Julia. License is MIT: http://julialang.org/license
1+
# This file is a part of StringEncodings.jl. License is MIT: http://julialang.org/license
22

33
module StringEncodings
44
import Base: close, eof, flush, read, readall, write, show
@@ -8,6 +8,7 @@ export StringEncoder, StringDecoder, encode, decode, encodings
88
export StringEncodingError, OutputBufferError, IConvError
99
export InvalidEncodingError, InvalidSequenceError, IncompleteSequenceError
1010

11+
include("encodings.jl")
1112

1213
abstract StringEncodingError
1314

@@ -62,7 +63,7 @@ function iconv_close(cd::Ptr{Void})
6263
end
6364
end
6465

65-
function iconv_open(tocode, fromcode)
66+
function iconv_open(tocode::ASCIIString, fromcode::ASCIIString)
6667
p = ccall((:iconv_open, libiconv), Ptr{Void}, (Cstring, Cstring), tocode, fromcode)
6768
if p != Ptr{Void}(-1)
6869
return p
@@ -173,14 +174,16 @@ end
173174
## StringEncoder
174175

175176
"""
176-
StringEncoder(istream, to, from="UTF-8")
177+
StringEncoder(ostream, to, from=enc"UTF-8")
177178
178179
Returns a new write-only I/O stream, which converts any text in the encoding `from`
179180
written to it into text in the encoding `to` written to `ostream`. Calling `close` on the
180181
stream is necessary to complete the encoding (but does not close `ostream`).
182+
183+
`to` and `from` can be specified either as a string or as an `Encoding` object.
181184
"""
182-
function StringEncoder(ostream::IO, to::ASCIIString, from::ASCIIString="UTF-8")
183-
cd = iconv_open(to, from)
185+
function StringEncoder(ostream::IO, to::Encoding, from::Encoding=enc"UTF-8")
186+
cd = iconv_open(ASCIIString(to), ASCIIString(from))
184187
inbuf = Vector{UInt8}(BUFSIZE)
185188
outbuf = Vector{UInt8}(BUFSIZE)
186189
s = StringEncoder(ostream, cd, inbuf, outbuf,
@@ -190,6 +193,11 @@ function StringEncoder(ostream::IO, to::ASCIIString, from::ASCIIString="UTF-8")
190193
s
191194
end
192195

196+
StringEncoder(ostream::IO, to::AbstractString, from::Encoding=enc"UTF-8") =
197+
StringEncoder(ostream, Encoding(to), from)
198+
StringEncoder(ostream::IO, to::AbstractString, from::AbstractString) =
199+
StringEncoder(ostream, Encoding(to), Encoding(from))
200+
193201
# Flush input buffer and convert it into output buffer
194202
# Returns the number of bytes written to output buffer
195203
function flush(s::StringEncoder)
@@ -226,16 +234,18 @@ end
226234
## StringDecoder
227235

228236
"""
229-
StringDecoder(istream, from, to="UTF-8")
237+
StringDecoder(istream, from, to=enc"UTF-8")
230238
231239
Returns a new read-only I/O stream, which converts text in the encoding `from`
232240
read from `istream` into text in the encoding `to`.
233241
242+
`to` and `from` can be specified either as a string or as an `Encoding` object.
243+
234244
Note that some implementations (notably the Windows one) may accept invalid sequences
235245
in the input data without raising an error.
236246
"""
237-
function StringDecoder(istream::IO, from::ASCIIString, to::ASCIIString="UTF-8")
238-
cd = iconv_open(to, from)
247+
function StringDecoder(istream::IO, from::Encoding, to::Encoding=enc"UTF-8")
248+
cd = iconv_open(ASCIIString(to), ASCIIString(from))
239249
inbuf = Vector{UInt8}(BUFSIZE)
240250
outbuf = Vector{UInt8}(BUFSIZE)
241251
s = StringDecoder(istream, cd, inbuf, outbuf,
@@ -245,6 +255,11 @@ function StringDecoder(istream::IO, from::ASCIIString, to::ASCIIString="UTF-8")
245255
s
246256
end
247257

258+
StringDecoder(istream::IO, from::AbstractString, to::Encoding=enc"UTF-8") =
259+
StringDecoder(istream, Encoding(from), to)
260+
StringDecoder(istream::IO, from::AbstractString, to::AbstractString) =
261+
StringDecoder(istream, Encoding(from), Encoding(to))
262+
248263
# Fill input buffer and convert it into output buffer
249264
# Returns the number of bytes written to output buffer
250265
function fill_buffer!(s::StringDecoder)
@@ -289,68 +304,67 @@ end
289304
## Convenience I/O functions
290305
if isdefined(Base, :readstring)
291306
@doc """
292-
readstring(stream or filename, enc::ASCIIString)
307+
readstring(stream or filename, enc::Encoding)
293308
294309
Read the entire contents of an I/O stream or a file in encoding `enc` as a string.
295310
""" ->
296-
Base.readstring(s::IO, enc::ASCIIString) = readstring(StringDecoder(s, enc))
297-
Base.readstring(filename::AbstractString, enc::ASCIIString) = open(io->readstring(io, enc), filename)
311+
Base.readstring(s::IO, enc::Encoding) = readstring(StringDecoder(s, enc))
312+
Base.readstring(filename::AbstractString, enc::Encoding) = open(io->readstring(io, enc), filename)
298313
else # Compatibility with Julia 0.4
299314
@doc """
300-
readall(stream or filename, enc::ASCIIString)
315+
readall(stream or filename, enc::Encoding)
301316
302317
Read the entire contents of an I/O stream or a file in encoding `enc` as a string.
303318
""" ->
304-
Base.readall(s::IO, enc::ASCIIString) = readall(StringDecoder(s, enc))
305-
Base.readall(filename::AbstractString, enc::ASCIIString) = open(io->readall(io, enc), filename)
319+
Base.readall(s::IO, enc::Encoding) = readall(StringDecoder(s, enc))
320+
Base.readall(filename::AbstractString, enc::Encoding) = open(io->readall(io, enc), filename)
306321
end
307322

308323

309324
## Functions to encode/decode strings
310325

311-
encoding_string(::Type{ASCIIString}) = "ASCII"
312-
encoding_string(::Type{UTF8String}) = "UTF-8"
313-
encoding_string(::Type{UTF16String}) = (ENDIAN_BOM == 0x04030201) ? "UTF-16LE" : "UTF-16BE"
314-
encoding_string(::Type{UTF32String}) = (ENDIAN_BOM == 0x04030201) ? "UTF-32LE" : "UTF-32BE"
315-
316326
"""
317-
decode([T,] a::Vector{UInt8}, enc::ASCIIString)
327+
decode([T,] a::Vector{UInt8}, enc)
318328
319329
Convert an array of bytes `a` representing text in encoding `enc` to a string of type `T`.
320330
By default, a `UTF8String` is returned.
321331
332+
`enc` can be specified either as a string or as an `Encoding` object.
333+
322334
Note that some implementations (notably the Windows one) may accept invalid sequences
323335
in the input data without raising an error.
324336
"""
325-
function decode{T<:AbstractString}(::Type{T}, a::Vector{UInt8}, enc::ASCIIString)
337+
function decode{T<:AbstractString}(::Type{T}, a::Vector{UInt8}, enc::Encoding)
326338
b = IOBuffer(a)
327339
try
328-
T(readbytes(StringDecoder(b, enc, encoding_string(T))))
340+
T(readbytes(StringDecoder(b, enc, encoding(T))))
329341
finally
330342
close(b)
331343
end
332344
end
333345

334-
decode(a::Vector{UInt8}, enc::ASCIIString) = decode(UTF8String, a, enc)
346+
decode{T<:AbstractString}(::Type{T}, a::Vector{UInt8}, enc::AbstractString) = decode(T, a, Encoding(enc))
347+
348+
decode(a::Vector{UInt8}, enc::AbstractString) = decode(UTF8String, a, Encoding(enc))
349+
decode(a::Vector{UInt8}, enc::Union{AbstractString, Encoding}) = decode(UTF8String, a, enc)
335350

336351
"""
337-
encode(s::AbstractString, enc::ASCIIString)
352+
encode(s::AbstractString, enc)
338353
339354
Convert string `s` to an array of bytes representing text in encoding `enc`.
355+
`enc` can be specified either as a string or as an `Encoding` object.
340356
"""
341-
function encode(s::AbstractString, enc::ASCIIString)
357+
function encode(s::AbstractString, enc::Encoding)
342358
b = IOBuffer()
343-
p = StringEncoder(b, enc, encoding_string(typeof(s)))
359+
p = StringEncoder(b, enc, encoding(typeof(s)))
344360
write(p, s)
345361
close(p)
346362
takebuf_array(b)
347363
end
348364

365+
encode(s::AbstractString, enc::AbstractString) = encode(s, Encoding(enc))
349366

350-
## Function to list supported encodings
351-
include("encodings.jl")
352-
353-
function test_encoding(enc)
367+
function test_encoding(enc::ASCIIString)
354368
# We assume that an encoding is supported if a converter from UTF-8 to it can be opened (iconv_open takes the target encoding first):
355369
cd = ccall((:iconv_open, libiconv), Ptr{Void}, (Cstring, Cstring), enc, "UTF-8")
356370
if cd == Ptr{Void}(-1)

0 commit comments

Comments
 (0)