|
33 | 33 |
|
34 | 34 | ## Functions giving information about a particular encoding
|
35 | 35 |
|
| 36 | +# NO_ENDIAN: insensitive to endianness |
| 37 | +# BIG_ENDIAN: default to big-endian |
| 38 | +# LOW_ENDIAN: default to little-endian |
| 39 | +# BIG_ENDIAN_AUTO: endianness detection using BOM on input, defaults to big-endian on output |
| 40 | +# LOW_ENDIAN_AUTO: endianness detection using BOM on input, defaults to little-endian on output |
| 41 | +# NATIVE_ENDIAN_AUTO: endianness detection using BOM on input, defaults to native-endian on output |
| 42 | +@enum Endianness NO_ENDIAN BIG_ENDIAN LOW_ENDIAN BIG_ENDIAN_AUTO LOW_ENDIAN_AUTO NATIVE_ENDIAN_AUTO |
| 43 | + |
| 44 | +immutable EncodingInfo |
| 45 | + name::ASCIIString |
| 46 | + codeunit::Int8 # Number of bytes per codeunit |
| 47 | + codepoint::Int8 # Number of bytes per codepoint; for MBCS, negative values give the maximum number of bytes. NOTE(review): table entries such as UTF-32 (codeunit 4, codepoint 1) and UTF-16 (codeunit 2, codepoint -2) look like counts of codeunits rather than bytes -- confirm the intended unit |
| 48 | + lowendian::Endianness # Endianness, if applicable; holds an Endianness enum value, not a Bool, despite the field name |
| 49 | + ascii::Bool # Is the encoding a superset of ASCII? |
| 50 | + unicode::Bool # Is the encoding Unicode-compatible? |
| 51 | +end |
| 52 | + |
36 | 53 | """
|
37 | 54 | native_endian(enc)
|
38 | 55 |
|
|
87 | 104 |
|
88 | 105 | codeunit(enc::AbstractString) = codeunit(Encoding(enc)) # Convenience method: wraps the encoding name in Encoding(...) and forwards to the codeunit method for that value
|
89 | 106 |
|
| 107 | +const encodings_list2 = EncodingInfo[ # Known encodings; arguments follow the EncodingInfo field order: name, codeunit, codepoint, endianness, ascii, unicode |
| 108 | + EncodingInfo("ASCII", 1, 1, NO_ENDIAN, true, true), |
| 109 | + |
| 110 | + # Unicode encodings |
| 111 | + EncodingInfo("UTF-8", 1, -4, NO_ENDIAN, true, true), |
| 112 | + EncodingInfo("UTF-16", 2, -2, BIG_ENDIAN_AUTO, false, true), # FIXME: iconv implementations vary regarding endianness |
| 113 | + EncodingInfo("UTF-16LE", 2, -2, LOW_ENDIAN, false, true), |
| 114 | + EncodingInfo("UTF-16BE", 2, -2, BIG_ENDIAN, false, true), |
| 115 | + EncodingInfo("UTF-32", 4, 1, BIG_ENDIAN_AUTO, false, true), # FIXME: iconv implementations vary regarding endianness |
| 116 | + EncodingInfo("UTF-32LE", 4, 1, LOW_ENDIAN, false, true), |
| 117 | + EncodingInfo("UTF-32BE", 4, 1, BIG_ENDIAN, false, true), |
| 118 | + |
| 119 | + EncodingInfo("UCS-2", 2, 1, BIG_ENDIAN_AUTO, false, true), # FIXME: iconv implementations vary regarding endianness |
| 120 | + EncodingInfo("UCS-2LE", 2, 1, LOW_ENDIAN, false, true), |
| 121 | + EncodingInfo("UCS-2BE", 2, 1, BIG_ENDIAN, false, true), |
| 122 | + |
| 123 | + # ISO-8859 |
| 124 | + EncodingInfo("ISO-8859-1", 1, 1, NO_ENDIAN, true, true), |
| 125 | + EncodingInfo("ISO-8859-2", 1, 1, NO_ENDIAN, true, true), |
| 126 | + EncodingInfo("ISO-8859-3", 1, 1, NO_ENDIAN, true, true), |
| 127 | + EncodingInfo("ISO-8859-4", 1, 1, NO_ENDIAN, true, true), |
| 128 | + EncodingInfo("ISO-8859-5", 1, 1, NO_ENDIAN, true, true), |
| 129 | + EncodingInfo("ISO-8859-6", 1, 1, NO_ENDIAN, true, true), |
| 130 | + EncodingInfo("ISO-8859-7", 1, 1, NO_ENDIAN, true, true), |
| 131 | + EncodingInfo("ISO-8859-8", 1, 1, NO_ENDIAN, true, true), |
| 132 | + EncodingInfo("ISO-8859-9", 1, 1, NO_ENDIAN, true, true), |
| 133 | + EncodingInfo("ISO-8859-10", 1, 1, NO_ENDIAN, true, true), |
| 134 | + EncodingInfo("ISO-8859-11", 1, 1, NO_ENDIAN, true, true), |
| 135 | + EncodingInfo("ISO-8859-12", 1, 1, NO_ENDIAN, true, true), # NOTE(review): ISO-8859-12 was never published as a standard -- confirm iconv accepts this name or drop the entry |
| 136 | + EncodingInfo("ISO-8859-13", 1, 1, NO_ENDIAN, true, true), |
| 137 | + EncodingInfo("ISO-8859-14", 1, 1, NO_ENDIAN, true, true), |
| 138 | + EncodingInfo("ISO-8859-15", 1, 1, NO_ENDIAN, true, true), |
| 139 | + EncodingInfo("ISO-8859-16", 1, 1, NO_ENDIAN, true, true), |
| 140 | + |
| 141 | + # KOI8 codepages |
| 142 | + EncodingInfo("KOI8-R", 1, 1, NO_ENDIAN, true, true), |
| 143 | + EncodingInfo("KOI8-U", 1, 1, NO_ENDIAN, true, true), |
| 144 | + EncodingInfo("KOI8-RU", 1, 1, NO_ENDIAN, true, true), |
| 145 | + |
| 146 | + # 8-bit Windows codepages |
| 147 | + EncodingInfo("CP1250", 1, 1, NO_ENDIAN, true, true), |
| 148 | + EncodingInfo("CP1251", 1, 1, NO_ENDIAN, true, true), |
| 149 | + EncodingInfo("CP1252", 1, 1, NO_ENDIAN, true, true), |
| 150 | + EncodingInfo("CP1253", 1, 1, NO_ENDIAN, true, true), |
| 151 | + EncodingInfo("CP1254", 1, 1, NO_ENDIAN, true, true), |
| 152 | + EncodingInfo("CP1255", 1, 1, NO_ENDIAN, true, true), |
| 153 | + EncodingInfo("CP1256", 1, 1, NO_ENDIAN, true, true), |
| 154 | + EncodingInfo("CP1257", 1, 1, NO_ENDIAN, true, true), |
| 155 | + EncodingInfo("CP1258", 1, 1, NO_ENDIAN, true, true), |
| 156 | + |
| 157 | + # DOS 8-bit codepages |
| 158 | + EncodingInfo("CP850", 1, 1, NO_ENDIAN, true, true), |
| 159 | + EncodingInfo("CP866", 1, 1, NO_ENDIAN, true, true), |
| 160 | + |
| 161 | + # Mac 8-bit codepages |
| 162 | + EncodingInfo("MacRoman", 1, 1, NO_ENDIAN, true, true), |
| 163 | + EncodingInfo("MacCentralEurope", 1, 1, NO_ENDIAN, true, true), |
| 164 | + EncodingInfo("MacIceland", 1, 1, NO_ENDIAN, true, true), |
| 165 | + EncodingInfo("MacCroatian", 1, 1, NO_ENDIAN, true, true), |
| 166 | + EncodingInfo("MacRomania", 1, 1, NO_ENDIAN, true, true), |
| 167 | + EncodingInfo("MacCyrillic", 1, 1, NO_ENDIAN, true, true), |
| 168 | + EncodingInfo("MacUkraine", 1, 1, NO_ENDIAN, true, true), |
| 169 | + EncodingInfo("MacGreek", 1, 1, NO_ENDIAN, true, true), |
| 170 | + EncodingInfo("MacTurkish", 1, 1, NO_ENDIAN, true, true), |
| 171 | + EncodingInfo("MacHebrew", 1, 1, NO_ENDIAN, true, true), |
| 172 | + EncodingInfo("MacArabic", 1, 1, NO_ENDIAN, true, true), |
| 173 | + EncodingInfo("MacThai", 1, 1, NO_ENDIAN, true, true), |
| 174 | + |
| 175 | + # Other 8-bit codepages |
| 176 | + EncodingInfo("HP-ROMAN8", 1, 1, NO_ENDIAN, true, true), |
| 177 | + EncodingInfo("NEXTSTEP", 1, 1, NO_ENDIAN, true, true) |
| 178 | + |
| 179 | + # TODO: other encodings (8-bit and others) |
| 180 | + ] |
| 181 | + |
90 | 182 |
|
91 | 183 | ## Lists of all known encodings taken from various iconv implementations,
|
92 | 184 | ## including different aliases for the same encoding
|
93 | 185 |
|
| 186 | + |
94 | 187 | # 8-bit codeunit encodings
|
95 | 188 | const encodings8 = [
|
96 | 189 | "ASCII", "US-ASCII", "us-ascii", "CSASCII",
|
|