File: | src/mod/xml_int/mod_xml_rpc/../../../../libs/xmlrpc-c/lib/expat/xmltok/xmltok.c |
Location: | line 1248, column 17 |
Description: | Assigned value is garbage or undefined |
1 | /* | |||
2 | Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd | |||
3 | See the file copying.txt for copying permission. | |||
4 | */ | |||
5 | ||||
6 | #include "xmlrpc_config.h" | |||
7 | #include "bool.h" | |||
8 | #include "xmldef.h" | |||
9 | #include "xmltok.h" | |||
10 | #include "nametab.h" | |||
11 | ||||
/* Extra tokenizer entry appended after cdataSectionTok in the first
   brace group of VTABLE1.  Note the leading comma: it is meant to be
   pasted directly behind the preceding initializer. */
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)

/* Positional initializer list for the function-pointer part of an
   ENCODING.  PREFIX() is resolved where the macro is used, so the same
   list serves every instantiation of xmltok_impl.c. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the two converters that carry the same prefix. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
29 | ||||
/* Look up the naming bit for the UCS-2 character whose high byte is
   'hi' and low byte is 'lo'.  pages[hi] selects a group of 8 words in
   namingBitmap, the top 3 bits of 'lo' select the word and the bottom
   5 bits select the bit.  Evaluates to nonzero if the bit is set. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits
between the bottom 5 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1 << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits
between the bottom 4, 6 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1 << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked
   up in the tables, any other length evaluates to 0.  p may be any
   byte pointer; it is viewed as unsigned char for the lookup. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
61 | ||||
/* Nonzero if the 3-byte sequence at p (an unsigned char pointer) is one
   that must be rejected:
     - ED followed by a byte with bit 0x20 set (the surrogate range
       U+D800..U+DFFF), or
     - EF BF BE / EF BF BF (U+FFFE and U+FFFF).
   The argument is evaluated more than once. */
#define UTF8_INVALID3(p) \
  ((*(p)) == 0xED \
  ? (((p)[1] & 0x20) != 0) \
  : ((*(p)) == 0xEF \
     ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
     : 0))

/* Nonzero if the 4-byte sequence at p starts with F4 and its second
   byte has either of the 0x30 bits set, i.e. encodes a value above
   U+10FFFF. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
70 | ||||
71 | static | |||
72 | int isNever(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), const char *p ATTR_UNUSED__attribute__((__unused__))) | |||
73 | { | |||
74 | return 0; | |||
75 | } | |||
76 | ||||
77 | static | |||
78 | int utf8_isName2(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), const char *p) | |||
79 | { | |||
80 | return UTF8_GET_NAMING2(namePages, (const unsigned char *)p)(namingBitmap[((namePages)[((((const unsigned char *)p)[0]) >> 2) & 7] << 3) + (((((const unsigned char *)p)[0]) & 3) << 1) + (((((const unsigned char *)p)[1]) >> 5 ) & 1)] & (1 << ((((const unsigned char *)p)[1] ) & 0x1F))); | |||
81 | } | |||
82 | ||||
83 | static | |||
84 | int utf8_isName3(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), const char *p) | |||
85 | { | |||
86 | return UTF8_GET_NAMING3(namePages, (const unsigned char *)p)(namingBitmap[((namePages)[(((((const unsigned char *)p)[0]) & 0xF) << 4) + (((((const unsigned char *)p)[1]) >> 2) & 0xF)] << 3) + (((((const unsigned char *)p)[1 ]) & 3) << 1) + (((((const unsigned char *)p)[2]) >> 5) & 1)] & (1 << ((((const unsigned char *)p)[ 2]) & 0x1F))); | |||
87 | } | |||
88 | ||||
89 | #define utf8_isName4isNever isNever | |||
90 | ||||
91 | static | |||
92 | int utf8_isNmstrt2(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), const char *p) | |||
93 | { | |||
94 | return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p)(namingBitmap[((nmstrtPages)[((((const unsigned char *)p)[0]) >> 2) & 7] << 3) + (((((const unsigned char * )p)[0]) & 3) << 1) + (((((const unsigned char *)p)[ 1]) >> 5) & 1)] & (1 << ((((const unsigned char *)p)[1]) & 0x1F))); | |||
95 | } | |||
96 | ||||
97 | static | |||
98 | int utf8_isNmstrt3(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), const char *p) | |||
99 | { | |||
100 | return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p)(namingBitmap[((nmstrtPages)[(((((const unsigned char *)p)[0] ) & 0xF) << 4) + (((((const unsigned char *)p)[1]) >> 2) & 0xF)] << 3) + (((((const unsigned char *)p)[1 ]) & 3) << 1) + (((((const unsigned char *)p)[2]) >> 5) & 1)] & (1 << ((((const unsigned char *)p)[ 2]) & 0x1F))); | |||
101 | } | |||
102 | ||||
103 | #define utf8_isNmstrt4isNever isNever | |||
104 | ||||
105 | #define utf8_isInvalid2isNever isNever | |||
106 | ||||
107 | static | |||
108 | int utf8_isInvalid3(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), const char *p) | |||
109 | { | |||
110 | return UTF8_INVALID3((const unsigned char *)p)((*(const unsigned char *)p) == 0xED ? ((((const unsigned char *)p)[1] & 0x20) != 0) : ((*(const unsigned char *)p) == 0xEF ? (((const unsigned char *)p)[1] == 0xBF && (((const unsigned char *)p)[2] == 0xBF || ((const unsigned char *)p)[ 2] == 0xBE)) : 0)); | |||
111 | } | |||
112 | ||||
113 | static | |||
114 | int utf8_isInvalid4(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), const char *p) | |||
115 | { | |||
116 | return UTF8_INVALID4((const unsigned char *)p)((*(const unsigned char *)p) == 0xF4 && (((const unsigned char *)p)[1] & 0x30) != 0); | |||
117 | } | |||
118 | ||||
119 | struct normal_encoding { | |||
120 | ENCODING enc; | |||
121 | unsigned char type[256]; | |||
122 | #ifdef XML_MIN_SIZE | |||
123 | int (*byteType)(const ENCODING *, const char *); | |||
124 | int (*isNameMin)(const ENCODING *, const char *); | |||
125 | int (*isNmstrtMin)(const ENCODING *, const char *); | |||
126 | int (*byteToAscii)(const ENCODING *, const char *); | |||
127 | int (*charMatches)(const ENCODING *, const char *, int); | |||
128 | #endif /* XML_MIN_SIZE */ | |||
129 | int (*isName2)(const ENCODING *, const char *); | |||
130 | int (*isName3)(const ENCODING *, const char *); | |||
131 | int (*isName4)(const ENCODING *, const char *); | |||
132 | int (*isNmstrt2)(const ENCODING *, const char *); | |||
133 | int (*isNmstrt3)(const ENCODING *, const char *); | |||
134 | int (*isNmstrt4)(const ENCODING *, const char *); | |||
135 | int (*isInvalid2)(const ENCODING *, const char *); | |||
136 | int (*isInvalid3)(const ENCODING *, const char *); | |||
137 | int (*isInvalid4)(const ENCODING *, const char *); | |||
138 | }; | |||
139 | ||||
/* Initializers for the XML_MIN_SIZE-only members of
   struct normal_encoding.  The expansion ends with a comma so that it
   can be followed directly by NORMAL_VTABLE / NULL_NORMAL_VTABLE; in
   the default build it expands to nothing. */
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte predicate members, using the
   functions (or aliases) whose names start with the prefix E. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4

/* Nine null pointers for the same members, used by the encoding
   tables that do not supply multi-byte predicates. */
#define NULL_NORMAL_VTABLE \
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
168 | ||||
169 | static int checkCharRefNumber(int); | |||
170 | ||||
171 | #include "xmltok_impl.h" | |||
172 | #include "ascii.h" | |||
173 | ||||
/* Character-access macros for the first inclusion of xmltok_impl.c
   (PREFIX normal_).  With XML_MIN_SIZE every access goes through a
   function pointer stored in struct normal_encoding; otherwise the
   macros expand to inline expressions. */

#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif

/* Byte type of the byte at p, read from the encoding's type table. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
static
int sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
static
int sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif

/* n is the byte length of the character (2, 3 or 4); it is pasted
   onto the member name. */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif
242 | ||||
243 | #define PREFIX(ident) normal_ ## ident | |||
244 | #include "xmltok_impl.c" | |||
245 | ||||
246 | #undef MINBPC | |||
247 | #undef BYTE_TYPE | |||
248 | #undef BYTE_TO_ASCII | |||
249 | #undef CHAR_MATCHES | |||
250 | #undef IS_NAME_CHAR | |||
251 | #undef IS_NAME_CHAR_MINBPC | |||
252 | #undef IS_NMSTRT_CHAR | |||
253 | #undef IS_NMSTRT_CHAR_MINBPC | |||
254 | #undef IS_INVALID_CHAR | |||
255 | ||||
/* UTF8_cvalN is the value of the masked first byte of an N byte
   UTF-8 sequence, i.e. the marker bits that are OR-ed into the lead
   byte when a sequence of that length is written. */
enum {
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
262 | ||||
263 | static | |||
264 | void utf8_toUtf8(const ENCODING * enc ATTR_UNUSED__attribute__((__unused__)), | |||
265 | const char **fromP, const char *fromLim, | |||
266 | char **toP, const char *toLim) | |||
267 | { | |||
268 | char *to; | |||
269 | const char *from; | |||
270 | if (fromLim - *fromP > toLim - *toP) { | |||
271 | /* Avoid copying partial characters. */ | |||
272 | for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) | |||
273 | if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) | |||
274 | break; | |||
275 | } | |||
276 | for (to = *toP, from = *fromP; from != fromLim; from++, to++) | |||
277 | *to = *from; | |||
278 | *fromP = from; | |||
279 | *toP = to; | |||
280 | } | |||
281 | ||||
282 | static | |||
283 | void utf8_toUtf16(const ENCODING *enc, | |||
284 | const char **fromP, const char *fromLim, | |||
285 | unsigned short **toP, const unsigned short *toLim) | |||
286 | { | |||
287 | unsigned short *to = *toP; | |||
288 | const char *from = *fromP; | |||
289 | while (from != fromLim && to != toLim) { | |||
290 | switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { | |||
291 | case BT_LEAD2: | |||
292 | *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f); | |||
293 | from += 2; | |||
294 | break; | |||
295 | case BT_LEAD3: | |||
296 | *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f); | |||
297 | from += 3; | |||
298 | break; | |||
299 | case BT_LEAD4: | |||
300 | { | |||
301 | unsigned long n; | |||
302 | if (to + 1 == toLim) | |||
303 | break; | |||
304 | n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); | |||
305 | n -= 0x10000; | |||
306 | to[0] = (unsigned short)((n >> 10) | 0xD800); | |||
307 | to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); | |||
308 | to += 2; | |||
309 | from += 4; | |||
310 | } | |||
311 | break; | |||
312 | default: | |||
313 | *to++ = *from++; | |||
314 | break; | |||
315 | } | |||
316 | } | |||
317 | *fromP = from; | |||
318 | *toP = to; | |||
319 | } | |||
320 | ||||
321 | static const struct normal_encoding utf8_encoding_ns = { | |||
322 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |||
323 | { | |||
324 | #include "asciitab.h" | |||
325 | #include "utf8tab.h" | |||
326 | }, | |||
327 | STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)utf8_isName2, utf8_isName3, isNever, utf8_isNmstrt2, utf8_isNmstrt3 , isNever, isNever, utf8_isInvalid3, utf8_isInvalid4 | |||
328 | }; | |||
329 | ||||
330 | static const struct normal_encoding utf8_encoding = { | |||
331 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |||
332 | { | |||
333 | #define BT_COLON BT_NMSTRT | |||
334 | #include "asciitab.h" | |||
335 | #undef BT_COLON | |||
336 | #include "utf8tab.h" | |||
337 | }, | |||
338 | STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)utf8_isName2, utf8_isName3, isNever, utf8_isNmstrt2, utf8_isNmstrt3 , isNever, isNever, utf8_isInvalid3, utf8_isInvalid4 | |||
339 | }; | |||
340 | ||||
341 | static const struct normal_encoding internal_utf8_encoding_ns = { | |||
342 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |||
343 | { | |||
344 | #include "iasciitab.h" | |||
345 | #include "utf8tab.h" | |||
346 | }, | |||
347 | STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)utf8_isName2, utf8_isName3, isNever, utf8_isNmstrt2, utf8_isNmstrt3 , isNever, isNever, utf8_isInvalid3, utf8_isInvalid4 | |||
348 | }; | |||
349 | ||||
350 | static const struct normal_encoding internal_utf8_encoding = { | |||
351 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |||
352 | { | |||
353 | #define BT_COLON BT_NMSTRT | |||
354 | #include "iasciitab.h" | |||
355 | #undef BT_COLON | |||
356 | #include "utf8tab.h" | |||
357 | }, | |||
358 | STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)utf8_isName2, utf8_isName3, isNever, utf8_isNmstrt2, utf8_isNmstrt3 , isNever, isNever, utf8_isInvalid3, utf8_isInvalid4 | |||
359 | }; | |||
360 | ||||
361 | ||||
362 | ||||
363 | static void | |||
364 | latin1_toUtf8(const ENCODING * const enc ATTR_UNUSED__attribute__((__unused__)), | |||
365 | const char ** const fromP, | |||
366 | const char * const fromLim, | |||
367 | char ** const toP, | |||
368 | const char * const toLim) { | |||
369 | /*---------------------------------------------------------------------------- | |||
370 | Convert the Latin1 string that starts at *fromP and ends at 'fromLim' | |||
371 | to UTF8 in the buffer that starts at *toP and ends at 'toLim'. | |||
372 | ||||
373 | Go from left to right and stop when the output buffer is full. | |||
374 | ||||
375 | Note that the buffer can be full while still having a byte left in it | |||
376 | because a Latin1 character may require two bytes of the output buffer. | |||
377 | ||||
378 | Leave *fromP and *toP pointing after the last character converted. | |||
379 | -----------------------------------------------------------------------------*/ | |||
380 | bool bufferIsFull; | |||
381 | ||||
382 | for (bufferIsFull = false; *fromP != fromLim && !bufferIsFull;) { | |||
383 | unsigned char const c = (unsigned char)**fromP; | |||
384 | if (c & 0x80) { | |||
385 | if (toLim - *toP < 2) | |||
386 | bufferIsFull = true; | |||
387 | else { | |||
388 | *(*toP)++ = ((c >> 6) | UTF8_cval2); | |||
389 | *(*toP)++ = ((c & 0x3f) | 0x80); | |||
390 | ++(*fromP); | |||
391 | } | |||
392 | } else { | |||
393 | if (*toP == toLim) | |||
394 | bufferIsFull = true; | |||
395 | else | |||
396 | *(*toP)++ = *(*fromP)++; | |||
397 | } | |||
398 | } | |||
399 | } | |||
400 | ||||
401 | ||||
402 | ||||
403 | static | |||
404 | void latin1_toUtf16(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), | |||
405 | const char **fromP, const char *fromLim, | |||
406 | unsigned short **toP, const unsigned short *toLim) | |||
407 | { | |||
408 | while (*fromP != fromLim && *toP != toLim) | |||
409 | *(*toP)++ = (unsigned char)*(*fromP)++; | |||
410 | } | |||
411 | ||||
412 | static const struct normal_encoding latin1_encoding_ns = { | |||
413 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, | |||
414 | { | |||
415 | #include "asciitab.h" | |||
416 | #include "latin1tab.h" | |||
417 | }, | |||
418 | STANDARD_VTABLE(sb_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
419 | }; | |||
420 | ||||
421 | static const struct normal_encoding latin1_encoding = { | |||
422 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, | |||
423 | { | |||
424 | #define BT_COLON BT_NMSTRT | |||
425 | #include "asciitab.h" | |||
426 | #undef BT_COLON | |||
427 | #include "latin1tab.h" | |||
428 | }, | |||
429 | STANDARD_VTABLE(sb_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
430 | }; | |||
431 | ||||
432 | static | |||
433 | void ascii_toUtf8(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), | |||
434 | const char **fromP, const char *fromLim, | |||
435 | char **toP, const char *toLim) | |||
436 | { | |||
437 | while (*fromP != fromLim && *toP != toLim) | |||
438 | *(*toP)++ = *(*fromP)++; | |||
439 | } | |||
440 | ||||
441 | static const struct normal_encoding ascii_encoding_ns = { | |||
442 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, | |||
443 | { | |||
444 | #include "asciitab.h" | |||
445 | /* BT_NONXML == 0 */ | |||
446 | }, | |||
447 | STANDARD_VTABLE(sb_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
448 | }; | |||
449 | ||||
450 | static const struct normal_encoding ascii_encoding = { | |||
451 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, | |||
452 | { | |||
453 | #define BT_COLON BT_NMSTRT | |||
454 | #include "asciitab.h" | |||
455 | #undef BT_COLON | |||
456 | /* BT_NONXML == 0 */ | |||
457 | }, | |||
458 | STANDARD_VTABLE(sb_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
459 | }; | |||
460 | ||||
461 | static int unicode_byte_type(char hi, char lo) | |||
462 | { | |||
463 | switch ((unsigned char)hi) { | |||
464 | case 0xD8: case 0xD9: case 0xDA: case 0xDB: | |||
465 | return BT_LEAD4; | |||
466 | case 0xDC: case 0xDD: case 0xDE: case 0xDF: | |||
467 | return BT_TRAIL; | |||
468 | case 0xFF: | |||
469 | switch ((unsigned char)lo) { | |||
470 | case 0xFF: | |||
471 | case 0xFE: | |||
472 | return BT_NONXML; | |||
473 | } | |||
474 | break; | |||
475 | } | |||
476 | return BT_NONASCII; | |||
477 | } | |||
478 | ||||
/* Define E##toUtf8: convert UTF-16 from [*fromP, fromLim) to UTF-8 at
   [*toP, toLim).  GET_LO / GET_HI must be defined for the byte order
   at the point where the macro is instantiated.

   The generated function stops, with *fromP at the first unconverted
   code unit, when the next character does not fit in the output or
   when a high surrogate is not followed by a second code unit in the
   input.  A trailing odd byte is never read.  enc is not used. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc ATTR_UNUSED, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  /* shrink to a whole number of 2-byte units, so that the loop \
     cannot step over fromLim */ \
  fromLim = from + (((fromLim - from) >> 1) << 1); \
  for (; from < fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      /* the low surrogate must be inside the input as well */ \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
541 | ||||
/* Define E##toUtf16: copy UTF-16 code units from the byte buffer
   [*fromP, fromLim) into native unsigned shorts at [*toP, toLim),
   assembling each unit with GET_HI / GET_LO (which must be defined for
   the byte order at the point of instantiation).  Stops when either
   side runs out; *fromP and *toP are advanced in place.  A trailing
   odd byte is never read.  enc is not used. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc ATTR_UNUSED, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* shrink to a whole number of 2-byte units; this also keeps \
     GET_HI(fromLim - 2) below inside the input */ \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP < fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
555 | ||||
556 | #define SET2(ptr, ch) \ | |||
557 | (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) | |||
558 | #define GET_LO(ptr) ((unsigned char)(ptr)[0]) | |||
559 | #define GET_HI(ptr) ((unsigned char)(ptr)[1]) | |||
560 | ||||
561 | DEFINE_UTF16_TO_UTF8(little2_)static void little2_toUtf8(const ENCODING *enc __attribute__( (__unused__)), const char **fromP, const char *fromLim, char * *toP, const char *toLim) { const char *from; for (from = *fromP ; from != fromLim; from += 2) { int plane; unsigned char lo2; unsigned char lo = GET_LO(from); unsigned char hi = GET_HI(from ); switch (hi) { case 0: if (lo < 0x80) { if (*toP == toLim ) { *fromP = from; return; } *(*toP)++ = lo; break; } case 0x1 : case 0x2: case 0x3: case 0x4: case 0x5: case 0x6: case 0x7: if (toLim - *toP < 2) { *fromP = from; return; } *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); *(*toP)++ = ((lo & 0x3f) | 0x80); break; default: if (toLim - *toP < 3) { *fromP = from; return; } *(*toP)++ = ((hi >> 4) | UTF8_cval3); *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); *(*toP)++ = ((lo & 0x3f) | 0x80 ); break; case 0xD8: case 0xD9: case 0xDA: case 0xDB: if (toLim - *toP < 4) { *fromP = from; return; } plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; *(*toP) ++ = ((plane >> 2) | UTF8_cval4); *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); from += 2; lo2 = GET_LO(from); *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) | (lo2 >> 6 ) | 0x80); *(*toP)++ = ((lo2 & 0x3f) | 0x80); break; } } * fromP = from; } | |||
562 | DEFINE_UTF16_TO_UTF16(little2_)static void little2_toUtf16(const ENCODING *enc __attribute__ ((__unused__)), const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { if (fromLim - *fromP > ((toLim - *toP) << 1) && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) fromLim -= 2; for (; *fromP != fromLim && *toP != toLim; *fromP += 2) *(*toP)++ = (GET_HI(* fromP) << 8) | GET_LO(*fromP); } | |||
563 | ||||
564 | #undef SET2 | |||
565 | #undef GET_LO | |||
566 | #undef GET_HI | |||
567 | ||||
568 | #define SET2(ptr, ch) \ | |||
569 | (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) | |||
570 | #define GET_LO(ptr) ((unsigned char)(ptr)[1]) | |||
571 | #define GET_HI(ptr) ((unsigned char)(ptr)[0]) | |||
572 | ||||
573 | DEFINE_UTF16_TO_UTF8(big2_)static void big2_toUtf8(const ENCODING *enc __attribute__((__unused__ )), const char **fromP, const char *fromLim, char **toP, const char *toLim) { const char *from; for (from = *fromP; from != fromLim; from += 2) { int plane; unsigned char lo2; unsigned char lo = GET_LO(from); unsigned char hi = GET_HI(from); switch (hi) { case 0: if (lo < 0x80) { if (*toP == toLim) { *fromP = from; return; } *(*toP)++ = lo; break; } case 0x1: case 0x2 : case 0x3: case 0x4: case 0x5: case 0x6: case 0x7: if (toLim - *toP < 2) { *fromP = from; return; } *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); *(*toP)++ = ((lo & 0x3f ) | 0x80); break; default: if (toLim - *toP < 3) { *fromP = from; return; } *(*toP)++ = ((hi >> 4) | UTF8_cval3); * (*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80 ); *(*toP)++ = ((lo & 0x3f) | 0x80); break; case 0xD8: case 0xD9: case 0xDA: case 0xDB: if (toLim - *toP < 4) { *fromP = from; return; } plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; *(*toP)++ = ((plane >> 2) | UTF8_cval4); *(*toP)++ = (((lo >> 2) & 0xF) | (( plane & 0x3) << 4) | 0x80); from += 2; lo2 = GET_LO (from); *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from ) & 0x3) << 2) | (lo2 >> 6) | 0x80); *(*toP)++ = ((lo2 & 0x3f) | 0x80); break; } } *fromP = from; } | |||
574 | DEFINE_UTF16_TO_UTF16(big2_)static void big2_toUtf16(const ENCODING *enc __attribute__((__unused__ )), const char **fromP, const char *fromLim, unsigned short * *toP, const unsigned short *toLim) { if (fromLim - *fromP > ((toLim - *toP) << 1) && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) fromLim -= 2; for (; *fromP != fromLim && *toP != toLim; *fromP += 2) *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); } | |||
575 | ||||
576 | #undef SET2 | |||
577 | #undef GET_LO | |||
578 | #undef GET_HI | |||
579 | ||||
580 | #define LITTLE2_BYTE_TYPE(enc, p)((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0])) \ | |||
581 | ((p)[1] == 0 \ | |||
582 | ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ | |||
583 | : unicode_byte_type((p)[1], (p)[0])) | |||
584 | #define LITTLE2_BYTE_TO_ASCII(enc, p)((p)[1] == 0 ? (p)[0] : -1) ((p)[1] == 0 ? (p)[0] : -1) | |||
585 | #define LITTLE2_CHAR_MATCHES(enc, p, c)((p)[1] == 0 && (p)[0] == c) ((p)[1] == 0 && (p)[0] == c) | |||
586 | #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + ( ((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) \ | |||
587 | UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + ( ((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) | |||
588 | #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) + (((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) \ | |||
589 | UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) + (((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) | |||
590 | ||||
591 | #ifdef XML_MIN_SIZE | |||
592 | ||||
593 | static | |||
594 | int little2_byteType(const ENCODING *enc, const char *p) | |||
595 | { | |||
596 | return LITTLE2_BYTE_TYPE(enc, p)((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0])); | |||
597 | } | |||
598 | ||||
599 | static | |||
600 | int little2_byteToAscii(const ENCODING *enc, const char *p) | |||
601 | { | |||
602 | return LITTLE2_BYTE_TO_ASCII(enc, p)((p)[1] == 0 ? (p)[0] : -1); | |||
603 | } | |||
604 | ||||
605 | static | |||
606 | int little2_charMatches(const ENCODING *enc, const char *p, int c) | |||
607 | { | |||
608 | return LITTLE2_CHAR_MATCHES(enc, p, c)((p)[1] == 0 && (p)[0] == c); | |||
609 | } | |||
610 | ||||
611 | static | |||
612 | int little2_isNameMin(const ENCODING *enc, const char *p) | |||
613 | { | |||
614 | return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + ( ((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))); | |||
615 | } | |||
616 | ||||
617 | static | |||
618 | int little2_isNmstrtMin(const ENCODING *enc, const char *p) | |||
619 | { | |||
620 | return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) + (((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))); | |||
621 | } | |||
622 | ||||
623 | #undef VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16) | |||
624 | #define VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16) VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), little2_toUtf8, little2_toUtf16 | |||
625 | ||||
626 | #else /* not XML_MIN_SIZE */ | |||
627 | ||||
628 | #undef PREFIX | |||
629 | #define PREFIX(ident) little2_ ## ident | |||
630 | #define MINBPC(enc) 2 | |||
631 | /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ | |||
632 | #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0])) | |||
633 | #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)((p)[1] == 0 ? (p)[0] : -1) | |||
634 | #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)((p)[1] == 0 && (p)[0] == c) | |||
635 | #define IS_NAME_CHAR(enc, p, n) 0 | |||
636 | #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + ( ((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) | |||
637 | #define IS_NMSTRT_CHAR(enc, p, n) (0) | |||
638 | #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) + (((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) | |||
639 | ||||
640 | #include "xmltok_impl.c" | |||
641 | ||||
642 | #undef MINBPC | |||
643 | #undef BYTE_TYPE | |||
644 | #undef BYTE_TO_ASCII | |||
645 | #undef CHAR_MATCHES | |||
646 | #undef IS_NAME_CHAR | |||
647 | #undef IS_NAME_CHAR_MINBPC | |||
648 | #undef IS_NMSTRT_CHAR | |||
649 | #undef IS_NMSTRT_CHAR_MINBPC | |||
650 | #undef IS_INVALID_CHAR | |||
651 | ||||
652 | #endif /* not XML_MIN_SIZE */ | |||
653 | ||||
/* Encoding table for the little2_ (low byte first, 2 bytes per unit) tokenizer.
   Initializer layout as written here:
     1. the VTABLE function list, then the constants 2 and 0, then a flag that
        is 1 only when XML_BYTE_ORDER == 12 and 0 otherwise;
     2. the byte-type table assembled from asciitab.h followed by latin1tab.h
        (BT_COLON is left as the headers define it);
     3. STANDARD_VTABLE(little2_) followed by NULL_NORMAL_VTABLE, which the
        listing shows expanding to nine null pointers.
   NOTE(review): the constants presumably fill minBytesPerChar and the
   isUtf8/isUtf16 members of ENCODING - confirm against xmltok.h. */
654 | static const struct normal_encoding little2_encoding_ns = { | |||
655 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, | |||
656 | #if XML_BYTE_ORDER == 12 | |||
657 | 1 | |||
658 | #else | |||
659 | 0 | |||
660 | #endif | |||
661 | }, | |||
662 | { | |||
663 | #include "asciitab.h" | |||
664 | #include "latin1tab.h" | |||
665 | }, | |||
666 | STANDARD_VTABLE(little2_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
667 | }; | |||
668 | ||||
/* Variant of the little2_ encoding table in which BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included, so the colon entry of the byte-type
   table takes the name-start type (presumably the non-namespace-aware
   table - confirm with the callers that select it).
   The rest of the initializer is: the VTABLE function list, the constants
   2 and 0, a flag that is 1 only when XML_BYTE_ORDER == 12, the latin1tab.h
   half of the byte-type table, then STANDARD_VTABLE(little2_) and
   NULL_NORMAL_VTABLE. */
669 | static const struct normal_encoding little2_encoding = { | |||
670 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, | |||
671 | #if XML_BYTE_ORDER == 12 | |||
672 | 1 | |||
673 | #else | |||
674 | 0 | |||
675 | #endif | |||
676 | }, | |||
677 | { | |||
678 | #define BT_COLON BT_NMSTRT | |||
679 | #include "asciitab.h" | |||
680 | #undef BT_COLON | |||
681 | #include "latin1tab.h" | |||
682 | }, | |||
683 | STANDARD_VTABLE(little2_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
684 | }; | |||
685 | ||||
686 | #if XML_BYTE_ORDER != 21 | |||
687 | ||||
/* "Internal" little2_ encoding table; the surrounding file only compiles it
   when XML_BYTE_ORDER != 21.  It differs from little2_encoding_ns in two
   visible ways: the trailing flag after "2, 0" is unconditionally 1, and the
   first half of the byte-type table comes from iasciitab.h instead of
   asciitab.h. */
688 | static const struct normal_encoding internal_little2_encoding_ns = { | |||
689 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, 1 }, | |||
690 | { | |||
691 | #include "iasciitab.h" | |||
692 | #include "latin1tab.h" | |||
693 | }, | |||
694 | STANDARD_VTABLE(little2_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
695 | }; | |||
696 | ||||
/* "Internal" little2_ encoding table with BT_COLON defined as BT_NMSTRT while
   iasciitab.h is included, so the colon entry takes the name-start type.
   The trailing flag after "2, 0" is unconditionally 1.  The surrounding file
   only compiles this definition when XML_BYTE_ORDER != 21. */
697 | static const struct normal_encoding internal_little2_encoding = { | |||
698 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, 1 }, | |||
699 | { | |||
700 | #define BT_COLON BT_NMSTRT | |||
701 | #include "iasciitab.h" | |||
702 | #undef BT_COLON | |||
703 | #include "latin1tab.h" | |||
704 | }, | |||
705 | STANDARD_VTABLE(little2_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
706 | }; | |||
707 | ||||
708 | #endif | |||
709 | ||||
710 | ||||
711 | #define BIG2_BYTE_TYPE(enc, p)((p)[0] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1])) \ | |||
712 | ((p)[0] == 0 \ | |||
713 | ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ | |||
714 | : unicode_byte_type((p)[0], (p)[1])) | |||
715 | #define BIG2_BYTE_TO_ASCII(enc, p)((p)[0] == 0 ? (p)[1] : -1) ((p)[0] == 0 ? (p)[1] : -1) | |||
716 | #define BIG2_CHAR_MATCHES(enc, p, c)((p)[0] == 0 && (p)[1] == c) ((p)[0] == 0 && (p)[1] == c) | |||
717 | #define BIG2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + ( ((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) \ | |||
718 | UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + ( ((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) | |||
719 | #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) + (((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) \ | |||
720 | UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) + (((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) | |||
721 | ||||
722 | #ifdef XML_MIN_SIZE | |||
723 | ||||
724 | static | |||
725 | int big2_byteType(const ENCODING *enc, const char *p) | |||
726 | { | |||
727 | return BIG2_BYTE_TYPE(enc, p)((p)[0] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1])); | |||
728 | } | |||
729 | ||||
730 | static | |||
731 | int big2_byteToAscii(const ENCODING *enc, const char *p) | |||
732 | { | |||
733 | return BIG2_BYTE_TO_ASCII(enc, p)((p)[0] == 0 ? (p)[1] : -1); | |||
734 | } | |||
735 | ||||
736 | static | |||
737 | int big2_charMatches(const ENCODING *enc, const char *p, int c) | |||
738 | { | |||
739 | return BIG2_CHAR_MATCHES(enc, p, c)((p)[0] == 0 && (p)[1] == c); | |||
740 | } | |||
741 | ||||
742 | static | |||
743 | int big2_isNameMin(const ENCODING *enc, const char *p) | |||
744 | { | |||
745 | return BIG2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + ( ((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))); | |||
746 | } | |||
747 | ||||
748 | static | |||
749 | int big2_isNmstrtMin(const ENCODING *enc, const char *p) | |||
750 | { | |||
751 | return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) + (((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))); | |||
752 | } | |||
753 | ||||
754 | #undef VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16) | |||
755 | #define VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16) VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), big2_toUtf8, big2_toUtf16 | |||
756 | ||||
757 | #else /* not XML_MIN_SIZE */ | |||
758 | ||||
759 | #undef PREFIX | |||
760 | #define PREFIX(ident) big2_ ## ident | |||
761 | #define MINBPC(enc) 2 | |||
762 | /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ | |||
763 | #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)((p)[0] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1])) | |||
764 | #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)((p)[0] == 0 ? (p)[1] : -1) | |||
765 | #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)((p)[0] == 0 && (p)[1] == c) | |||
766 | #define IS_NAME_CHAR(enc, p, n) 0 | |||
767 | #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + ( ((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) | |||
768 | #define IS_NMSTRT_CHAR(enc, p, n) (0) | |||
769 | #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) + (((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) | |||
770 | ||||
771 | #include "xmltok_impl.c" | |||
772 | ||||
773 | #undef MINBPC | |||
774 | #undef BYTE_TYPE | |||
775 | #undef BYTE_TO_ASCII | |||
776 | #undef CHAR_MATCHES | |||
777 | #undef IS_NAME_CHAR | |||
778 | #undef IS_NAME_CHAR_MINBPC | |||
779 | #undef IS_NMSTRT_CHAR | |||
780 | #undef IS_NMSTRT_CHAR_MINBPC | |||
781 | #undef IS_INVALID_CHAR | |||
782 | ||||
783 | #endif /* not XML_MIN_SIZE */ | |||
784 | ||||
/* Encoding table for the big2_ (high byte first, 2 bytes per unit) tokenizer.
   Initializer layout as written here:
     1. the VTABLE function list, then the constants 2 and 0, then a flag that
        is 1 only when XML_BYTE_ORDER == 21 and 0 otherwise;
     2. the byte-type table assembled from asciitab.h followed by latin1tab.h
        (BT_COLON is left as the headers define it);
     3. STANDARD_VTABLE(big2_) followed by NULL_NORMAL_VTABLE, which the
        listing shows expanding to nine null pointers.
   NOTE(review): the constants presumably fill minBytesPerChar and the
   isUtf8/isUtf16 members of ENCODING - confirm against xmltok.h. */
785 | static const struct normal_encoding big2_encoding_ns = { | |||
786 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, | |||
787 | #if XML_BYTE_ORDER == 21 | |||
788 | 1 | |||
789 | #else | |||
790 | 0 | |||
791 | #endif | |||
792 | }, | |||
793 | { | |||
794 | #include "asciitab.h" | |||
795 | #include "latin1tab.h" | |||
796 | }, | |||
797 | STANDARD_VTABLE(big2_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
798 | }; | |||
799 | ||||
/* Variant of the big2_ encoding table in which BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included, so the colon entry of the byte-type
   table takes the name-start type (presumably the non-namespace-aware
   table - confirm with the callers that select it).
   The rest of the initializer is: the VTABLE function list, the constants
   2 and 0, a flag that is 1 only when XML_BYTE_ORDER == 21, the latin1tab.h
   half of the byte-type table, then STANDARD_VTABLE(big2_) and
   NULL_NORMAL_VTABLE. */
800 | static const struct normal_encoding big2_encoding = { | |||
801 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, | |||
802 | #if XML_BYTE_ORDER == 21 | |||
803 | 1 | |||
804 | #else | |||
805 | 0 | |||
806 | #endif | |||
807 | }, | |||
808 | { | |||
809 | #define BT_COLON BT_NMSTRT | |||
810 | #include "asciitab.h" | |||
811 | #undef BT_COLON | |||
812 | #include "latin1tab.h" | |||
813 | }, | |||
814 | STANDARD_VTABLE(big2_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
815 | }; | |||
816 | ||||
817 | #if XML_BYTE_ORDER != 12 | |||
818 | ||||
/* "Internal" big2_ encoding table; the surrounding file only compiles it when
   XML_BYTE_ORDER != 12.  It differs from big2_encoding_ns in two visible
   ways: the trailing flag after "2, 0" is unconditionally 1, and the first
   half of the byte-type table comes from iasciitab.h instead of asciitab.h. */
819 | static const struct normal_encoding internal_big2_encoding_ns = { | |||
820 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, 1 }, | |||
821 | { | |||
822 | #include "iasciitab.h" | |||
823 | #include "latin1tab.h" | |||
824 | }, | |||
825 | STANDARD_VTABLE(big2_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
826 | }; | |||
827 | ||||
/* "Internal" big2_ encoding table with BT_COLON defined as BT_NMSTRT while
   iasciitab.h is included, so the colon entry takes the name-start type.
   The trailing flag after "2, 0" is unconditionally 1.  The surrounding file
   only compiles this definition when XML_BYTE_ORDER != 12. */
828 | static const struct normal_encoding internal_big2_encoding = { | |||
829 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, 1 }, | |||
830 | { | |||
831 | #define BT_COLON BT_NMSTRT | |||
832 | #include "iasciitab.h" | |||
833 | #undef BT_COLON | |||
834 | #include "latin1tab.h" | |||
835 | }, | |||
836 | STANDARD_VTABLE(big2_) NULL_NORMAL_VTABLE((void*)0), ((void*)0), ((void*)0), ((void*)0), ((void*)0), ( (void*)0), ((void*)0), ((void*)0), ((void*)0) | |||
837 | }; | |||
838 | ||||
839 | #endif | |||
840 | ||||
841 | #undef PREFIX | |||
842 | ||||
843 | static | |||
844 | int streqci(const char *s1, const char *s2) | |||
845 | { | |||
846 | for (;;) { | |||
847 | char c1 = *s1++; | |||
848 | char c2 = *s2++; | |||
849 | if (ASCII_a0x61 <= c1 && c1 <= ASCII_z0x7A) | |||
850 | c1 += ASCII_A0x41 - ASCII_a0x61; | |||
851 | if (ASCII_a0x61 <= c2 && c2 <= ASCII_z0x7A) | |||
852 | c2 += ASCII_A0x41 - ASCII_a0x61; | |||
853 | if (c1 != c2) | |||
854 | return 0; | |||
855 | if (!c1) | |||
856 | break; | |||
857 | } | |||
858 | return 1; | |||
859 | } | |||
860 | ||||
861 | static | |||
862 | void initUpdatePosition(const ENCODING *enc ATTR_UNUSED__attribute__((__unused__)), const char *ptr, | |||
863 | const char *end, POSITION *pos) | |||
864 | { | |||
865 | normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); | |||
866 | } | |||
867 | ||||
868 | static | |||
869 | int toAscii(const ENCODING *enc, const char *ptr, const char *end) | |||
870 | { | |||
871 | char buf[1]; | |||
872 | char *p = buf; | |||
873 | XmlUtf8Convert(enc, &ptr, end, &p, p + 1)(((enc)->utf8Convert)(enc, &ptr, end, &p, p + 1)); | |||
874 | if (p == buf) | |||
875 | return -1; | |||
876 | else | |||
877 | return buf[0]; | |||
878 | } | |||
879 | ||||
/* Return 1 when c is one of the four white space characters tested here
   (space 0x20, carriage return 0xD, line feed 0xA, tab 0x9), otherwise 0. */
static int
isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
892 | ||||
893 | /* Return 1 if there's just optional white space | |||
894 | or there's an S followed by name=val. */ | |||
895 | static | |||
896 | int parsePseudoAttribute(const ENCODING *enc, | |||
897 | const char *ptr, | |||
898 | const char *end, | |||
899 | const char **namePtr, | |||
900 | const char **nameEndPtr, | |||
901 | const char **valPtr, | |||
902 | const char **nextTokPtr) | |||
903 | { | |||
904 | int c; | |||
905 | char open; | |||
906 | if (ptr == end) { | |||
907 | *namePtr = 0; | |||
908 | return 1; | |||
909 | } | |||
910 | if (!isSpace(toAscii(enc, ptr, end))) { | |||
911 | *nextTokPtr = ptr; | |||
912 | return 0; | |||
913 | } | |||
914 | do { | |||
915 | ptr += enc->minBytesPerChar; | |||
916 | } while (isSpace(toAscii(enc, ptr, end))); | |||
917 | if (ptr == end) { | |||
918 | *namePtr = 0; | |||
919 | return 1; | |||
920 | } | |||
921 | *namePtr = ptr; | |||
922 | for (;;) { | |||
923 | c = toAscii(enc, ptr, end); | |||
924 | if (c == -1) { | |||
925 | *nextTokPtr = ptr; | |||
926 | return 0; | |||
927 | } | |||
928 | if (c == ASCII_EQUALS0x3D) { | |||
929 | *nameEndPtr = ptr; | |||
930 | break; | |||
931 | } | |||
932 | if (isSpace(c)) { | |||
933 | *nameEndPtr = ptr; | |||
934 | do { | |||
935 | ptr += enc->minBytesPerChar; | |||
936 | } while (isSpace(c = toAscii(enc, ptr, end))); | |||
937 | if (c != ASCII_EQUALS0x3D) { | |||
938 | *nextTokPtr = ptr; | |||
939 | return 0; | |||
940 | } | |||
941 | break; | |||
942 | } | |||
943 | ptr += enc->minBytesPerChar; | |||
944 | } | |||
945 | if (ptr == *namePtr) { | |||
946 | *nextTokPtr = ptr; | |||
947 | return 0; | |||
948 | } | |||
949 | ptr += enc->minBytesPerChar; | |||
950 | c = toAscii(enc, ptr, end); | |||
951 | while (isSpace(c)) { | |||
952 | ptr += enc->minBytesPerChar; | |||
953 | c = toAscii(enc, ptr, end); | |||
954 | } | |||
955 | if (c != ASCII_QUOT0x22 && c != ASCII_APOS0x27) { | |||
956 | *nextTokPtr = ptr; | |||
957 | return 0; | |||
958 | } | |||
959 | open = c; | |||
960 | ptr += enc->minBytesPerChar; | |||
961 | *valPtr = ptr; | |||
962 | for (;; ptr += enc->minBytesPerChar) { | |||
963 | c = toAscii(enc, ptr, end); | |||
964 | if (c == open) | |||
965 | break; | |||
966 | if (!(ASCII_a0x61 <= c && c <= ASCII_z0x7A) | |||
967 | && !(ASCII_A0x41 <= c && c <= ASCII_Z0x5A) | |||
968 | && !(ASCII_00x30 <= c && c <= ASCII_90x39) | |||
969 | && c != ASCII_PERIOD0x2E | |||
970 | && c != ASCII_MINUS0x2D | |||
971 | && c != ASCII_UNDERSCORE0x5F) { | |||
972 | *nextTokPtr = ptr; | |||
973 | return 0; | |||
974 | } | |||
975 | } | |||
976 | *nextTokPtr = ptr + enc->minBytesPerChar; | |||
977 | return 1; | |||
978 | } | |||
979 | ||||
/* Keyword tables used by doParseXmlDecl.  Each is a NUL-terminated string
   spelled with the ASCII_x constants rather than character literals; the
   listing shows every constant fused with its value (e.g. 0x76 for 'v'). */

/* "version": matched against the first pseudo-attribute name. */
980 | static const char KW_version[] = { | |||
981 | ASCII_v0x76, ASCII_e0x65, ASCII_r0x72, ASCII_s0x73, ASCII_i0x69, ASCII_o0x6F, ASCII_n0x6E, '\0' | |||
982 | }; | |||
983 | ||||
/* "encoding": matched against the pseudo-attribute name that follows. */
984 | static const char KW_encoding[] = { | |||
985 | ASCII_e0x65, ASCII_n0x6E, ASCII_c0x63, ASCII_o0x6F, ASCII_d0x64, ASCII_i0x69, ASCII_n0x6E, ASCII_g0x67, '\0' | |||
986 | }; | |||
987 | ||||
/* "standalone": matched against the last pseudo-attribute name. */
988 | static const char KW_standalone[] = { | |||
989 | ASCII_s0x73, ASCII_t0x74, ASCII_a0x61, ASCII_n0x6E, ASCII_d0x64, ASCII_a0x61, ASCII_l0x6C, ASCII_o0x6F, ASCII_n0x6E, ASCII_e0x65, '\0' | |||
990 | }; | |||
991 | ||||
/* "yes": accepted value of the standalone pseudo-attribute. */
992 | static const char KW_yes[] = { | |||
993 | ASCII_y0x79, ASCII_e0x65, ASCII_s0x73, '\0' | |||
994 | }; | |||
995 | ||||
/* "no": accepted value of the standalone pseudo-attribute. */
996 | static const char KW_no[] = { | |||
997 | ASCII_n0x6E, ASCII_o0x6F, '\0' | |||
998 | }; | |||
999 | ||||
1000 | static | |||
1001 | int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, | |||
1002 | const char *, | |||
1003 | const char *), | |||
1004 | int isGeneralTextEntity, | |||
1005 | const ENCODING *enc, | |||
1006 | const char *ptr, | |||
1007 | const char *end, | |||
1008 | const char **badPtr, | |||
1009 | const char **versionPtr, | |||
1010 | const char **encodingName, | |||
1011 | const ENCODING **encoding, | |||
1012 | int *standalone) | |||
1013 | { | |||
1014 | const char *val = 0; | |||
1015 | const char *name = 0; | |||
1016 | const char *nameEnd = 0; | |||
1017 | ptr += 5 * enc->minBytesPerChar; | |||
1018 | end -= 2 * enc->minBytesPerChar; | |||
1019 | if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name) { | |||
1020 | *badPtr = ptr; | |||
1021 | return 0; | |||
1022 | } | |||
1023 | if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)(((enc)->nameMatchesAscii)(enc, name, nameEnd, KW_version) )) { | |||
1024 | if (!isGeneralTextEntity) { | |||
1025 | *badPtr = name; | |||
1026 | return 0; | |||
1027 | } | |||
1028 | } | |||
1029 | else { | |||
1030 | if (versionPtr) | |||
1031 | *versionPtr = val; | |||
1032 | if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { | |||
1033 | *badPtr = ptr; | |||
1034 | return 0; | |||
1035 | } | |||
1036 | if (!name) { | |||
1037 | if (isGeneralTextEntity) { | |||
1038 | /* a TextDecl must have an EncodingDecl */ | |||
1039 | *badPtr = ptr; | |||
1040 | return 0; | |||
1041 | } | |||
1042 | return 1; | |||
1043 | } | |||
1044 | } | |||
1045 | if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)(((enc)->nameMatchesAscii)(enc, name, nameEnd, KW_encoding ))) { | |||
1046 | int c = toAscii(enc, val, end); | |||
1047 | if (!(ASCII_a0x61 <= c && c <= ASCII_z0x7A) && !(ASCII_A0x41 <= c && c <= ASCII_Z0x5A)) { | |||
1048 | *badPtr = val; | |||
1049 | return 0; | |||
1050 | } | |||
1051 | if (encodingName) | |||
1052 | *encodingName = val; | |||
1053 | if (encoding) | |||
1054 | *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); | |||
1055 | if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { | |||
1056 | *badPtr = ptr; | |||
1057 | return 0; | |||
1058 | } | |||
1059 | if (!name) | |||
1060 | return 1; | |||
1061 | } | |||
1062 | if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)(((enc)->nameMatchesAscii)(enc, name, nameEnd, KW_standalone )) || isGeneralTextEntity) { | |||
1063 | *badPtr = name; | |||
1064 | return 0; | |||
1065 | } | |||
1066 | if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)(((enc)->nameMatchesAscii)(enc, val, ptr - enc->minBytesPerChar , KW_yes))) { | |||
1067 | if (standalone) | |||
1068 | *standalone = 1; | |||
1069 | } | |||
1070 | else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)(((enc)->nameMatchesAscii)(enc, val, ptr - enc->minBytesPerChar , KW_no))) { | |||
1071 | if (standalone) | |||
1072 | *standalone = 0; | |||
1073 | } | |||
1074 | else { | |||
1075 | *badPtr = val; | |||
1076 | return 0; | |||
1077 | } | |||
1078 | while (isSpace(toAscii(enc, ptr, end))) | |||
1079 | ptr += enc->minBytesPerChar; | |||
1080 | if (ptr != end) { | |||
1081 | *badPtr = ptr; | |||
1082 | return 0; | |||
1083 | } | |||
1084 | return 1; | |||
1085 | } | |||
1086 | ||||
1087 | static | |||
1088 | int checkCharRefNumber(int result) | |||
1089 | { | |||
1090 | switch (result >> 8) { | |||
1091 | case 0xD8: case 0xD9: case 0xDA: case 0xDB: | |||
1092 | case 0xDC: case 0xDD: case 0xDE: case 0xDF: | |||
1093 | return -1; | |||
1094 | case 0: | |||
1095 | if (latin1_encoding.type[result] == BT_NONXML) | |||
1096 | return -1; | |||
1097 | break; | |||
1098 | case 0xFF: | |||
1099 | if (result == 0xFFFE || result == 0xFFFF) | |||
1100 | return -1; | |||
1101 | break; | |||
1102 | } | |||
1103 | return result; | |||
1104 | } | |||
1105 | ||||
1106 | ||||
1107 | ||||
1108 | int | |||
1109 | xmlrpc_XmlUtf8Encode(int const c, | |||
1110 | char * const buf) { | |||
1111 | ||||
1112 | enum { | |||
1113 | /* minN is minimum legal resulting value for N byte sequence */ | |||
1114 | min2 = 0x80, | |||
1115 | min3 = 0x800, | |||
1116 | min4 = 0x10000 | |||
1117 | }; | |||
1118 | ||||
1119 | if (c < 0) | |||
1120 | return 0; | |||
1121 | if (c < min2) { | |||
1122 | buf[0] = (c | UTF8_cval1); | |||
1123 | return 1; | |||
1124 | } | |||
1125 | if (c < min3) { | |||
1126 | buf[0] = ((c >> 6) | UTF8_cval2); | |||
1127 | buf[1] = ((c & 0x3f) | 0x80); | |||
1128 | return 2; | |||
1129 | } | |||
1130 | if (c < min4) { | |||
1131 | buf[0] = ((c >> 12) | UTF8_cval3); | |||
1132 | buf[1] = (((c >> 6) & 0x3f) | 0x80); | |||
1133 | buf[2] = ((c & 0x3f) | 0x80); | |||
1134 | return 3; | |||
1135 | } | |||
1136 | if (c < 0x110000) { | |||
1137 | buf[0] = ((c >> 18) | UTF8_cval4); | |||
1138 | buf[1] = (((c >> 12) & 0x3f) | 0x80); | |||
1139 | buf[2] = (((c >> 6) & 0x3f) | 0x80); | |||
1140 | buf[3] = ((c & 0x3f) | 0x80); | |||
1141 | return 4; | |||
1142 | } | |||
1143 | return 0; | |||
1144 | } | |||
1145 | ||||
1146 | ||||
1147 | ||||
/* Encode character number 'charNumArg' as UTF-16 code units into 'buf'.

   Returns 1 and writes one unit for values below 0x10000, returns 2 and
   writes a 0xD800-based / 0xDC00-based pair for values below 0x110000, and
   returns 0 without touching 'buf' for negative or larger values. */
int
xmlrpc_XmlUtf16Encode(int              const charNumArg,
                      unsigned short * const buf) {

    if (charNumArg < 0 || charNumArg >= 0x110000)
        return 0;

    if (charNumArg < 0x10000) {
        buf[0] = charNumArg;
        return 1;
    } else {
        /* Split the 20-bit offset above 0x10000 into two 10-bit halves. */
        int const offset = charNumArg - 0x10000;

        buf[0] = (offset >> 10) + 0xD800;
        buf[1] = (offset & 0x3FF) + 0xDC00;
        return 2;
    }
}
1170 | ||||
1171 | ||||
1172 | ||||
/* Encoding object for a user-supplied ("unknown") encoding.  The unknown_*
   functions in this file cast the ENCODING pointer they receive to this
   structure. */
1173 | struct unknown_encoding { | |||
/* Embedded normal encoding; unknown_toUtf8 reads its byte-type table. */
1174 | struct normal_encoding normal; | |||
/* Callback that maps the character at p to a character number; it is called
   with userData as its first argument.  Results with bits set above 0xFFFF
   are treated as "not a name character" / invalid by the callers here. */
1175 | int (*convert)(void *userData, const char *p); | |||
1176 | void *userData; | |||
/* Per-byte UTF-16 table (used by unknown_toUtf16 - TODO confirm exact
   semantics, that function is not fully visible here). */
1177 | unsigned short utf16[256]; | |||
/* Per-byte UTF-8 table as read by unknown_toUtf8: element [b][0] is the
   number of bytes to emit for input byte b and the bytes follow it; a count
   of 0 means the convert callback must be used instead. */
1178 | char utf8[256][4]; | |||
1179 | }; | |||
1180 | ||||
1181 | ||||
1182 | ||||
1183 | int | |||
1184 | xmlrpc_XmlSizeOfUnknownEncoding(void) { | |||
1185 | ||||
1186 | return sizeof(struct unknown_encoding); | |||
1187 | } | |||
1188 | ||||
1189 | ||||
1190 | ||||
1191 | static | |||
1192 | int unknown_isName(const ENCODING *enc, const char *p) | |||
1193 | { | |||
1194 | int c = ((const struct unknown_encoding *)enc) | |||
1195 | ->convert(((const struct unknown_encoding *)enc)->userData, p); | |||
1196 | if (c & ~0xFFFF) | |||
1197 | return 0; | |||
1198 | return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF)(namingBitmap[(namePages[c >> 8] << 3) + ((c & 0xFF) >> 5)] & (1 << ((c & 0xFF) & 0x1F ))); | |||
1199 | } | |||
1200 | ||||
1201 | static | |||
1202 | int unknown_isNmstrt(const ENCODING *enc, const char *p) | |||
1203 | { | |||
1204 | int c = ((const struct unknown_encoding *)enc) | |||
1205 | ->convert(((const struct unknown_encoding *)enc)->userData, p); | |||
1206 | if (c & ~0xFFFF) | |||
1207 | return 0; | |||
1208 | return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF)(namingBitmap[(nmstrtPages[c >> 8] << 3) + ((c & 0xFF) >> 5)] & (1 << ((c & 0xFF) & 0x1F ))); | |||
1209 | } | |||
1210 | ||||
1211 | static | |||
1212 | int unknown_isInvalid(const ENCODING *enc, const char *p) | |||
1213 | { | |||
1214 | int c = ((const struct unknown_encoding *)enc) | |||
1215 | ->convert(((const struct unknown_encoding *)enc)->userData, p); | |||
1216 | return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; | |||
1217 | } | |||
1218 | ||||
1219 | static | |||
1220 | void unknown_toUtf8(const ENCODING *enc, | |||
1221 | const char **fromP, const char *fromLim, | |||
1222 | char **toP, const char *toLim) | |||
1223 | { | |||
1224 | char buf[XML_UTF8_ENCODE_MAX4]; | |||
1225 | for (;;) { | |||
| ||||
1226 | const char *utf8; | |||
1227 | int n; | |||
1228 | if (*fromP == fromLim) | |||
1229 | break; | |||
1230 | utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP]; | |||
1231 | n = *utf8++; | |||
1232 | if (n == 0) { | |||
1233 | int c = ((const struct unknown_encoding *)enc) | |||
1234 | ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); | |||
1235 | n = xmlrpc_XmlUtf8Encode(c, buf); | |||
1236 | if (n > toLim - *toP) | |||
1237 | break; | |||
1238 | utf8 = buf; | |||
1239 | *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] | |||
1240 | - (BT_LEAD2 - 2); | |||
1241 | } | |||
1242 | else { | |||
1243 | if (n > toLim - *toP) | |||
1244 | break; | |||
1245 | (*fromP)++; | |||
1246 | } | |||
1247 | do { | |||
1248 | *(*toP)++ = *utf8++; | |||
| ||||
1249 | } while (--n != 0); | |||
1250 | } | |||
1251 | } | |||
1252 | ||||
1253 | static | |||
1254 | void unknown_toUtf16(const ENCODING *enc, | |||
1255 | const char **fromP, const char *fromLim, | |||
1256 | unsigned short **toP, const unsigned short *toLim) | |||
1257 | { | |||
1258 | while (*fromP != fromLim && *toP != toLim) { | |||
1259 | unsigned short c | |||
1260 | = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP]; | |||
1261 | if (c == 0) { | |||
1262 | c = (unsigned short)((const struct unknown_encoding *)enc) | |||
1263 | ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); | |||
1264 | *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] | |||
1265 | - (BT_LEAD2 - 2); | |||
1266 | } | |||
1267 | else | |||
1268 | (*fromP)++; | |||
1269 | *(*toP)++ = c; | |||
1270 | } | |||
1271 | } | |||
1272 | ||||
1273 | ENCODING * | |||
1274 | xmlrpc_XmlInitUnknownEncoding(void * const mem, | |||
1275 | int * const table, | |||
1276 | int (*convert)(void *userData, const char *p), | |||
1277 | void * const userData) { | |||
1278 | ||||
1279 | int i; | |||
1280 | struct unknown_encoding *e = mem; | |||
1281 | for (i = 0; i < (int)sizeof(struct normal_encoding); i++) | |||
1282 | ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; | |||
1283 | for (i = 0; i < 128; i++) | |||
1284 | if (latin1_encoding.type[i] != BT_OTHER | |||
1285 | && latin1_encoding.type[i] != BT_NONXML | |||
1286 | && table[i] != i) | |||
1287 | return 0; | |||
1288 | for (i = 0; i < 256; i++) { | |||
1289 | int c = table[i]; | |||
1290 | if (c == -1) { | |||
1291 | e->normal.type[i] = BT_MALFORM; | |||
1292 | /* This shouldn't really get used. */ | |||
1293 | e->utf16[i] = 0xFFFF; | |||
1294 | e->utf8[i][0] = 1; | |||
1295 | e->utf8[i][1] = 0; | |||
1296 | } | |||
1297 | else if (c < 0) { | |||
1298 | if (c < -4) | |||
1299 | return 0; | |||
1300 | e->normal.type[i] = BT_LEAD2 - (c + 2); | |||
1301 | e->utf8[i][0] = 0; | |||
1302 | e->utf16[i] = 0; | |||
1303 | } | |||
1304 | else if (c < 0x80) { | |||
1305 | if (latin1_encoding.type[c] != BT_OTHER | |||
1306 | && latin1_encoding.type[c] != BT_NONXML | |||
1307 | && c != i) | |||
1308 | return 0; | |||
1309 | e->normal.type[i] = latin1_encoding.type[c]; | |||
1310 | e->utf8[i][0] = 1; | |||
1311 | e->utf8[i][1] = (char)c; | |||
1312 | e->utf16[i] = c == 0 ? 0xFFFF : c; | |||
1313 | } | |||
1314 | else if (checkCharRefNumber(c) < 0) { | |||
1315 | e->normal.type[i] = BT_NONXML; | |||
1316 | /* This shouldn't really get used. */ | |||
1317 | e->utf16[i] = 0xFFFF; | |||
1318 | e->utf8[i][0] = 1; | |||
1319 | e->utf8[i][1] = 0; | |||
1320 | } | |||
1321 | else { | |||
1322 | if (c > 0xFFFF) | |||
1323 | return 0; | |||
1324 | if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)(namingBitmap[(nmstrtPages[c >> 8] << 3) + ((c & 0xff) >> 5)] & (1 << ((c & 0xff) & 0x1F )))) | |||
1325 | e->normal.type[i] = BT_NMSTRT; | |||
1326 | else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)(namingBitmap[(namePages[c >> 8] << 3) + ((c & 0xff) >> 5)] & (1 << ((c & 0xff) & 0x1F )))) | |||
1327 | e->normal.type[i] = BT_NAME; | |||
1328 | else | |||
1329 | e->normal.type[i] = BT_OTHER; | |||
1330 | e->utf8[i][0] = (char)xmlrpc_XmlUtf8Encode(c, e->utf8[i] + 1); | |||
1331 | e->utf16[i] = c; | |||
1332 | } | |||
1333 | } | |||
1334 | e->userData = userData; | |||
1335 | e->convert = convert; | |||
1336 | if (convert) { | |||
1337 | e->normal.isName2 = unknown_isName; | |||
1338 | e->normal.isName3 = unknown_isName; | |||
1339 | e->normal.isName4 = unknown_isName; | |||
1340 | e->normal.isNmstrt2 = unknown_isNmstrt; | |||
1341 | e->normal.isNmstrt3 = unknown_isNmstrt; | |||
1342 | e->normal.isNmstrt4 = unknown_isNmstrt; | |||
1343 | e->normal.isInvalid2 = unknown_isInvalid; | |||
1344 | e->normal.isInvalid3 = unknown_isInvalid; | |||
1345 | e->normal.isInvalid4 = unknown_isInvalid; | |||
1346 | } | |||
1347 | e->normal.enc.utf8Convert = unknown_toUtf8; | |||
1348 | e->normal.enc.utf16Convert = unknown_toUtf16; | |||
1349 | return &(e->normal.enc); | |||
1350 | } | |||
1351 | ||||
/* Indices of the built-in encodings.

   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed.
*/
enum {
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6
};
1365 | ||||
1366 | static const char KW_ISO_8859_1[] = { | |||
1367 | ASCII_I0x49, ASCII_S0x53, ASCII_O0x4F, ASCII_MINUS0x2D, ASCII_80x38, ASCII_80x38, ASCII_50x35, ASCII_90x39, ASCII_MINUS0x2D, ASCII_10x31, '\0' | |||
1368 | }; | |||
1369 | static const char KW_US_ASCII[] = { | |||
1370 | ASCII_U0x55, ASCII_S0x53, ASCII_MINUS0x2D, ASCII_A0x41, ASCII_S0x53, ASCII_C0x43, ASCII_I0x49, ASCII_I0x49, '\0' | |||
1371 | }; | |||
1372 | static const char KW_UTF_8[] = { | |||
1373 | ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_80x38, '\0' | |||
1374 | }; | |||
1375 | static const char KW_UTF_16[] = { | |||
1376 | ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_10x31, ASCII_60x36, '\0' | |||
1377 | }; | |||
1378 | static const char KW_UTF_16BE[] = { | |||
1379 | ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_10x31, ASCII_60x36, ASCII_B0x42, ASCII_E0x45, '\0' | |||
1380 | }; | |||
1381 | static const char KW_UTF_16LE[] = { | |||
1382 | ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_10x31, ASCII_60x36, ASCII_L0x4C, ASCII_E0x45, '\0' | |||
1383 | }; | |||
1384 | ||||
1385 | static | |||
1386 | int getEncodingIndex(const char *name) | |||
1387 | { | |||
1388 | static const char *encodingNames[] = { | |||
1389 | KW_ISO_8859_1, | |||
1390 | KW_US_ASCII, | |||
1391 | KW_UTF_8, | |||
1392 | KW_UTF_16, | |||
1393 | KW_UTF_16BE, | |||
1394 | KW_UTF_16LE, | |||
1395 | }; | |||
1396 | int i; | |||
1397 | if (name == 0) | |||
1398 | return NO_ENC; | |||
1399 | for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) | |||
1400 | if (streqci(name, encodingNames[i])) | |||
1401 | return i; | |||
1402 | return UNKNOWN_ENC; | |||
1403 | } | |||
1404 | ||||
/* For binary compatibility, we store the index of the encoding specified
   at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int;
   SET_INIT_ENC_INDEX(enc, i) stores 'i' narrowed to char.  'i' is
   parenthesized so that the cast applies to the whole argument
   expression rather than only its first operand. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1410 | ||||
1411 | /* This is what detects the encoding. | |||
1412 | encodingTable maps from encoding indices to encodings; | |||
1413 | INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding; | |||
1414 | state is XML_CONTENT_STATE if we're parsing an external text entity, | |||
1415 | and XML_PROLOG_STATE otherwise. | |||
1416 | */ | |||
1417 | ||||
1418 | ||||
1419 | static | |||
1420 | int initScan(const ENCODING **encodingTable, | |||
1421 | const INIT_ENCODING *enc, | |||
1422 | int state, | |||
1423 | const char *ptr, | |||
1424 | const char *end, | |||
1425 | const char **nextTokPtr) | |||
1426 | { | |||
1427 | const ENCODING **encPtr; | |||
1428 | ||||
1429 | if (ptr == end) | |||
1430 | return XML_TOK_NONE-4; | |||
1431 | encPtr = enc->encPtr; | |||
1432 | if (ptr + 1 == end) { | |||
1433 | /* only a single byte available for auto-detection */ | |||
1434 | /* so we're parsing an external text entity... */ | |||
1435 | /* if UTF-16 was externally specified, then we need at least 2 bytes */ | |||
1436 | switch (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16)) { | |||
1437 | case UTF_16_ENC: | |||
1438 | case UTF_16LE_ENC: | |||
1439 | case UTF_16BE_ENC: | |||
1440 | return XML_TOK_PARTIAL-1; | |||
1441 | } | |||
1442 | switch ((unsigned char)*ptr) { | |||
1443 | case 0xFE: | |||
1444 | case 0xFF: | |||
1445 | case 0xEF: /* possibly first byte of UTF-8 BOM */ | |||
1446 | if (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == ISO_8859_1_ENC | |||
1447 | && state == XML_CONTENT_STATE1) | |||
1448 | break; | |||
1449 | /* fall through */ | |||
1450 | case 0x00: | |||
1451 | case 0x3C: | |||
1452 | return XML_TOK_PARTIAL-1; | |||
1453 | } | |||
1454 | } | |||
1455 | else { | |||
1456 | switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { | |||
1457 | case 0xFEFF: | |||
1458 | if (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == ISO_8859_1_ENC | |||
1459 | && state == XML_CONTENT_STATE1) | |||
1460 | break; | |||
1461 | *nextTokPtr = ptr + 2; | |||
1462 | *encPtr = encodingTable[UTF_16BE_ENC]; | |||
1463 | return XML_TOK_BOM14; | |||
1464 | /* 00 3C is handled in the default case */ | |||
1465 | case 0x3C00: | |||
1466 | if ((INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == UTF_16BE_ENC | |||
1467 | || INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == UTF_16_ENC) | |||
1468 | && state == XML_CONTENT_STATE1) | |||
1469 | break; | |||
1470 | *encPtr = encodingTable[UTF_16LE_ENC]; | |||
1471 | return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr )); | |||
1472 | case 0xFFFE: | |||
1473 | if (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == ISO_8859_1_ENC | |||
1474 | && state == XML_CONTENT_STATE1) | |||
1475 | break; | |||
1476 | *nextTokPtr = ptr + 2; | |||
1477 | *encPtr = encodingTable[UTF_16LE_ENC]; | |||
1478 | return XML_TOK_BOM14; | |||
1479 | case 0xEFBB: | |||
1480 | /* Maybe a UTF-8 BOM (EF BB BF) */ | |||
1481 | /* If there's an explicitly specified (external) encoding | |||
1482 | of ISO-8859-1 or some flavour of UTF-16 | |||
1483 | and this is an external text entity, | |||
1484 | don't look for the BOM, | |||
1485 | because it might be a legal data. */ | |||
1486 | if (state == XML_CONTENT_STATE1) { | |||
1487 | int e = INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16); | |||
1488 | if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC) | |||
1489 | break; | |||
1490 | } | |||
1491 | if (ptr + 2 == end) | |||
1492 | return XML_TOK_PARTIAL-1; | |||
1493 | if ((unsigned char)ptr[2] == 0xBF) { | |||
1494 | *encPtr = encodingTable[UTF_8_ENC]; | |||
1495 | return XML_TOK_BOM14; | |||
1496 | } | |||
1497 | break; | |||
1498 | default: | |||
1499 | if (ptr[0] == '\0') { | |||
1500 | /* 0 isn't a legal data character. Furthermore a document entity can only | |||
1501 | start with ASCII characters. So the only way this can fail to be big-endian | |||
1502 | UTF-16 if it it's an external parsed general entity that's labelled as | |||
1503 | UTF-16LE. */ | |||
1504 | if (state == XML_CONTENT_STATE1 && INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == UTF_16LE_ENC) | |||
1505 | break; | |||
1506 | *encPtr = encodingTable[UTF_16BE_ENC]; | |||
1507 | return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr )); | |||
1508 | } | |||
1509 | else if (ptr[1] == '\0') { | |||
1510 | /* We could recover here in the case: | |||
1511 | - parsing an external entity | |||
1512 | - second byte is 0 | |||
1513 | - no externally specified encoding | |||
1514 | - no encoding declaration | |||
1515 | by assuming UTF-16LE. But we don't, because this would mean when | |||
1516 | presented just with a single byte, we couldn't reliably determine | |||
1517 | whether we needed further bytes. */ | |||
1518 | if (state == XML_CONTENT_STATE1) | |||
1519 | break; | |||
1520 | *encPtr = encodingTable[UTF_16LE_ENC]; | |||
1521 | return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr )); | |||
1522 | } | |||
1523 | break; | |||
1524 | } | |||
1525 | } | |||
1526 | *encPtr = encodingTable[INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16)]; | |||
1527 | return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr )); | |||
1528 | } | |||
1529 | ||||
1530 | ||||
1531 | #define NS(x) x | |||
1532 | #define ns(x) x | |||
1533 | #include "xmltok_ns.c" | |||
1534 | #undef NS | |||
1535 | #undef ns | |||
1536 | ||||
1537 | #define NS(x) x ## NS | |||
1538 | #define ns(x) x ## _ns | |||
1539 | ||||
1540 | #include "xmltok_ns.c" | |||
1541 | ||||
1542 | #undef NS | |||
1543 | #undef ns | |||
1544 | ||||
1545 | ENCODING * | |||
1546 | xmlrpc_XmlInitUnknownEncodingNS(void * const mem, | |||
1547 | int * const table, | |||
1548 | int (*convert)(void *userData, const char *p), | |||
1549 | void * const userData) { | |||
1550 | ||||
1551 | ENCODING * const enc = | |||
1552 | xmlrpc_XmlInitUnknownEncoding(mem, table, convert, userData); | |||
1553 | if (enc) | |||
1554 | ((struct normal_encoding *)enc)->type[ASCII_COLON0x3A] = BT_COLON; | |||
1555 | return enc; | |||
1556 | } |