Skip to content

Commit 1671897

Browse files
committedFeb 14, 2016
WIP: store a list of encodings and their properties
1 parent 4c83568 commit 1671897

File tree

1 file changed

+93
-0
lines changed

1 file changed

+93
-0
lines changed
 

‎src/encodings.jl

+93
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,23 @@ end
3333

3434
## Functions giving information about a particular encoding
3535

36+
# NO_ENDIAN: insensitive to endianness
37+
# BIG_ENDIAN: default to big-endian
38+
# LOW_ENDIAN: default to low-endian (little-endian)
Has conversations. Original line has conversations.
39+
# BIG_ENDIAN_AUTO: endianness detection using BOM on input, defaults to big-endian on output
40+
# LOW_ENDIAN_AUTO: endianness detection using BOM on input, defaults to low-endian on output
41+
# NATIVE_ENDIAN_AUTO: endianness detection using BOM on input, defaults to native-endian on output
42+
# Closed set of byte-order behaviors an encoding can have; this is the type of
# the `lowendian` field of `EncodingInfo`.
@enum Endianness NO_ENDIAN BIG_ENDIAN LOW_ENDIAN BIG_ENDIAN_AUTO LOW_ENDIAN_AUTO NATIVE_ENDIAN_AUTO
43+
44+
# Static description of a single character encoding: its name together with
# its size, byte-order and compatibility properties.
# Instances are built positionally, in field order, e.g.
#     EncodingInfo("UTF-8", 1, -4, NO_ENDIAN, true, true)
immutable EncodingInfo
45+
name::ASCIIString # Name of the encoding, e.g. "UTF-8"
46+
codeunit::Int8 # Number of bytes per codeunit
47+
# Sign convention: a positive value is a fixed size; a negative value marks a
# variable-length encoding whose absolute value is the maximum size.
codepoint::Int8 # Number of bytes per codepoint; for MBCS, negative values give the maximum number of bytes
Has conversations. Original line has conversations.
48+
# Despite its name, this field holds an `Endianness` value, not a Bool.
lowendian::Endianness # Endianness, if applicable
49+
ascii::Bool # Is the encoding a superset of ASCII?
50+
unicode::Bool # Is the encoding Unicode-compatible?
Has conversations. Original line has conversations.
51+
end
52+
3653
"""
3754
native_endian(enc)
3855
@@ -87,10 +104,86 @@ end
87104

88105
# Convenience method for encodings given by name: wrap the string in an
# `Encoding` and forward to the `codeunit` method defined for that type.
function codeunit(enc::AbstractString)
    encoding = Encoding(enc)
    return codeunit(encoding)
end
89106

107+
# Table of known encodings and their properties. Each entry is built
# positionally as
#     EncodingInfo(name, codeunit, codepoint, lowendian, ascii, unicode)
# where `codeunit` is the number of bytes per codeunit, `codepoint` the number
# of bytes per codepoint (negative: variable-length, absolute value is the
# maximum), `lowendian` an `Endianness` value, `ascii` whether the encoding is
# a superset of ASCII and `unicode` whether it is Unicode-compatible.
const encodings_list2 = EncodingInfo[
    EncodingInfo("ASCII", 1, 1, NO_ENDIAN, true, true),

    # Unicode encodings
    EncodingInfo("UTF-8", 1, -4, NO_ENDIAN, true, true),
    # NOTE(review): the UTF-16 entries use -2 although a codepoint outside the
    # BMP takes 4 bytes (two 2-byte codeunits) -- confirm whether the
    # `codepoint` field is meant to count bytes or codeunits here.
    EncodingInfo("UTF-16", 2, -2, BIG_ENDIAN_AUTO, false, true), # FIXME: iconv implementations vary regarding endianness
    EncodingInfo("UTF-16LE", 2, -2, LOW_ENDIAN, false, true),
    EncodingInfo("UTF-16BE", 2, -2, BIG_ENDIAN, false, true),
    EncodingInfo("UTF-32", 4, 1, BIG_ENDIAN_AUTO, false, true), # FIXME: iconv implementations vary regarding endianness
    EncodingInfo("UTF-32LE", 4, 1, LOW_ENDIAN, false, true),
    EncodingInfo("UTF-32BE", 4, 1, BIG_ENDIAN, false, true),

    EncodingInfo("UCS-2", 2, 1, BIG_ENDIAN_AUTO, false, true), # FIXME: iconv implementations vary regarding endianness
    EncodingInfo("UCS-2LE", 2, 1, LOW_ENDIAN, false, true),
    EncodingInfo("UCS-2BE", 2, 1, BIG_ENDIAN, false, true),

    # ISO-8859
    # (ISO-8859-12 is intentionally absent: that part of the standard was
    # never published.)
    EncodingInfo("ISO-8859-1", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-2", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-3", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-4", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-5", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-6", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-7", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-8", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-9", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-10", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-11", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-13", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-14", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-15", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("ISO-8859-16", 1, 1, NO_ENDIAN, true, true),

    # KOI8 codepages
    EncodingInfo("KOI8-R", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("KOI8-U", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("KOI8-RU", 1, 1, NO_ENDIAN, true, true),

    # 8-bit Windows codepages
    EncodingInfo("CP1250", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("CP1251", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("CP1252", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("CP1253", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("CP1254", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("CP1255", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("CP1256", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("CP1257", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("CP1258", 1, 1, NO_ENDIAN, true, true),

    # DOS 8-bit codepages
    EncodingInfo("CP850", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("CP866", 1, 1, NO_ENDIAN, true, true),

    # Mac 8-bit codepages
    EncodingInfo("MacRoman", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacCentralEurope", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacIceland", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacCroatian", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacRomania", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacCyrillic", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacUkraine", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacGreek", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacTurkish", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacHebrew", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacArabic", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("MacThai", 1, 1, NO_ENDIAN, true, true),

    # Other 8-bit codepages
    EncodingInfo("HP-ROMAN8", 1, 1, NO_ENDIAN, true, true),
    EncodingInfo("NEXTSTEP", 1, 1, NO_ENDIAN, true, true)

    # TODO: other encodings (8-bit and others)
]
181+
90182

91183
## Lists of all known encodings taken from various iconv implementations,
92184
## including different aliases for the same encoding
93185

186+
94187
# 8-bit codeunit encodings
95188
const encodings8 = [
96189
"ASCII", "US-ASCII", "us-ascii", "CSASCII",

0 commit comments

Comments
 (0)
Please sign in to comment.