Bug Summary

File: libs/apr-util/xml/expat/lib/xmltok.c
Location: line 1252, column 17
Description: Assigned value is garbage or undefined

Annotated Source Code

1/*
2Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3See the file COPYING for copying permission.
4*/
5
/* Version-control identification string for this translation unit. */
static char RCSId[] =
  "$Header: /home/cvs/apr-util/xml/expat/lib/xmltok.c,v 1.1 2001/02/28 14:41:26 gstein Exp $";
8
9#ifdef COMPILED_FROM_DSP
10# include "winconfig.h"
11#else
12# include <config.h>
13#endif /* ndef COMPILED_FROM_DSP */
14
15#include "xmltok.h"
16#include "nametab.h"
17
/* Optional tail of the first braced group in VTABLE1.  With XML_DTD the
   ignore-section tokenizer is appended after the CDATA-section tokenizer;
   without it the macro contributes nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif
23
/* Initializer fragment listing the tokenizer entry points shared by every
   encoding.  PREFIX must be defined where the macro is expanded; it selects
   which family of functions (e.g. normal_) is named. */
#define VTABLE1 \
  { \
    PREFIX(prologTok), \
    PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE \
  }, \
  { \
    PREFIX(attributeValueTok), \
    PREFIX(entityValueTok) \
  }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)
37
/* VTABLE1 followed by the PREFIX-selected UTF-8 and UTF-16 converters. */
#define VTABLE \
  VTABLE1, \
  PREFIX(toUtf8), \
  PREFIX(toUtf16)
39
/* Look up the naming bit for the 16-bit character whose high byte is `hi`
   and low byte is `lo`.  pages[hi] selects a group of eight bitmap words,
   the top three bits of `lo` pick the word and its low five bits pick the
   bit.  Yields non-zero when the bit is set.
   `pages` is parenthesized so a pointer expression can be passed safely,
   matching UTF8_GET_NAMING2 and UTF8_GET_NAMING3. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
42
/* Naming-bit lookup for a 2 byte UTF-8 sequence.
   The character's 11 bits are split 5/6 between the two bytes.  Eight of
   them would index `pages`, three are added to the resulting bitmap index
   and the last five form the bit mask.  `byte` must be usable as an array
   of unsigned bytes. */
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1 << (((byte)[1]) & 0x1F)))
52
/* Naming-bit lookup for a 3 byte UTF-8 sequence.
   The character's 16 bits are split 4/6/6 between the three bytes.  The
   top eight index `pages`, the next three are added to the resulting
   bitmap index and the low five form the bit mask.  `byte` must be usable
   as an array of unsigned bytes. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1 << (((byte)[2]) & 0x1F)))
64
/* Dispatch a naming-bit lookup on the sequence length `n`: 2 and 3 byte
   sequences go to UTF8_GET_NAMING2 / UTF8_GET_NAMING3 with `p` viewed as
   unsigned bytes; any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : ((n) == 3 \
      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
      : 0))
71
/* Non-zero when the 3 byte UTF-8 sequence at `p` must be rejected:
     - lead byte 0xED with bit 0x20 set in the second byte, i.e. the
       sequence would decode to U+D800..U+DFFF;
     - 0xEF 0xBF 0xBE or 0xEF 0xBF 0xBF, i.e. U+FFFE or U+FFFF.
   `p` must point to unsigned bytes.  Every use of the argument is
   parenthesized so that pointer expressions such as `buf + 1` work. */
#define UTF8_INVALID3(p) \
  ((*(p)) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : ((*(p)) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))
78
/* Non-zero when the 4 byte UTF-8 sequence at `p` has lead byte 0xF4 and a
   second byte with either of the 0x30 bits set, i.e. it would decode above
   U+10FFFF.  `p` must point to unsigned bytes.  The argument is fully
   parenthesized so that pointer expressions such as `buf + 1` work. */
#define UTF8_INVALID4(p) \
  ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
80
81static
82int isNever(const ENCODING *enc, const char *p)
83{
84 return 0;
85}
86
87static
88int utf8_isName2(const ENCODING *enc, const char *p)
89{
90 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p)(namingBitmap[((namePages)[((((const unsigned char *)p)[0]) >>
2) & 7] << 3) + (((((const unsigned char *)p)[0]) &
3) << 1) + (((((const unsigned char *)p)[1]) >> 5
) & 1)] & (1 << ((((const unsigned char *)p)[1]
) & 0x1F)))
;
91}
92
93static
94int utf8_isName3(const ENCODING *enc, const char *p)
95{
96 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p)(namingBitmap[((namePages)[(((((const unsigned char *)p)[0]) &
0xF) << 4) + (((((const unsigned char *)p)[1]) >>
2) & 0xF)] << 3) + (((((const unsigned char *)p)[1
]) & 3) << 1) + (((((const unsigned char *)p)[2]) >>
5) & 1)] & (1 << ((((const unsigned char *)p)[
2]) & 0x1F)))
;
97}
98
/* The 4 byte name-character slot is filled with isNever. */
#define utf8_isName4 isNever
100
101static
102int utf8_isNmstrt2(const ENCODING *enc, const char *p)
103{
104 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p)(namingBitmap[((nmstrtPages)[((((const unsigned char *)p)[0])
>> 2) & 7] << 3) + (((((const unsigned char *
)p)[0]) & 3) << 1) + (((((const unsigned char *)p)[
1]) >> 5) & 1)] & (1 << ((((const unsigned
char *)p)[1]) & 0x1F)))
;
105}
106
107static
108int utf8_isNmstrt3(const ENCODING *enc, const char *p)
109{
110 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p)(namingBitmap[((nmstrtPages)[(((((const unsigned char *)p)[0]
) & 0xF) << 4) + (((((const unsigned char *)p)[1]) >>
2) & 0xF)] << 3) + (((((const unsigned char *)p)[1
]) & 3) << 1) + (((((const unsigned char *)p)[2]) >>
5) & 1)] & (1 << ((((const unsigned char *)p)[
2]) & 0x1F)))
;
111}
112
/* Slots filled with isNever: the 4 byte name-start check and the 2 byte
   invalid-sequence check. */
#define utf8_isNmstrt4 isNever

#define utf8_isInvalid2 isNever
116
117static
118int utf8_isInvalid3(const ENCODING *enc, const char *p)
119{
120 return UTF8_INVALID3((const unsigned char *)p)((*(const unsigned char *)p) == 0xED ? ((((const unsigned char
*)p)[1] & 0x20) != 0) : ((*(const unsigned char *)p) == 0xEF
? (((const unsigned char *)p)[1] == 0xBF && (((const
unsigned char *)p)[2] == 0xBF || ((const unsigned char *)p)[
2] == 0xBE)) : 0))
;
121}
122
123static
124int utf8_isInvalid4(const ENCODING *enc, const char *p)
125{
126 return UTF8_INVALID4((const unsigned char *)p)((*(const unsigned char *)p) == 0xF4 && (((const unsigned
char *)p)[1] & 0x30) != 0)
;
127}
128
129struct normal_encoding {
130 ENCODING enc;
131 unsigned char type[256];
132#ifdef XML_MIN_SIZE
133 int (*byteType)(const ENCODING *, const char *);
134 int (*isNameMin)(const ENCODING *, const char *);
135 int (*isNmstrtMin)(const ENCODING *, const char *);
136 int (*byteToAscii)(const ENCODING *, const char *);
137 int (*charMatches)(const ENCODING *, const char *, int);
138#endif /* XML_MIN_SIZE */
139 int (*isName2)(const ENCODING *, const char *);
140 int (*isName3)(const ENCODING *, const char *);
141 int (*isName4)(const ENCODING *, const char *);
142 int (*isNmstrt2)(const ENCODING *, const char *);
143 int (*isNmstrt3)(const ENCODING *, const char *);
144 int (*isNmstrt4)(const ENCODING *, const char *);
145 int (*isInvalid2)(const ENCODING *, const char *);
146 int (*isInvalid3)(const ENCODING *, const char *);
147 int (*isInvalid4)(const ENCODING *, const char *);
148};
149
/* Initializers for the XML_MIN_SIZE-only members of struct
   normal_encoding, named by pasting the prefix E onto each member name.
   The list ends with a comma so NORMAL_VTABLE can follow directly.
   Expands to nothing when XML_MIN_SIZE is not defined. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E) \
     E ## byteType, \
     E ## isNameMin, \
     E ## isNmstrtMin, \
     E ## byteToAscii, \
     E ## charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif
164
/* Initializers for the nine multi-byte character-class members of struct
   normal_encoding, in declaration order, named by pasting the prefix E
   onto each member name. */
#define NORMAL_VTABLE(E) \
  E ## isName2,    E ## isName3,    E ## isName4, \
  E ## isNmstrt2,  E ## isNmstrt3,  E ## isNmstrt4, \
  E ## isInvalid2, E ## isInvalid3, E ## isInvalid4
175
176static int checkCharRefNumber(int);
177
178#include "xmltok_impl.h"
179#include "ascii.h"
180
/* Settings for the single-byte ("normal") instantiation of the tokenizer.
   With XML_MIN_SIZE the minimal-size name checks are routed to isNever and
   MINBPC reads the encoding's minBytesPerChar; otherwise MINBPC is the
   constant 1. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif
192
/* Byte type of the byte at p, read from the type[] table of the
   struct normal_encoding that enc points to.  The byte is converted to
   unsigned char first so values above 0x7F index the table correctly. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
195
/* BYTE_TYPE(enc, p): byte type of the byte at p.
   With XML_MIN_SIZE the lookup goes through the encoding's byteType
   callback, for which sb_byteType is the table-driven implementation;
   otherwise it is the inline table lookup SB_BYTE_TYPE. */
#ifdef XML_MIN_SIZE
static int
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) \
     (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
207
/* BYTE_TO_ASCII(enc, p): value of the character at p.
   With XML_MIN_SIZE it calls the encoding's byteToAscii callback, for
   which sb_byteToAscii simply returns *p; otherwise it is *p inline. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) \
     (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
static int
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
219
/* Character-class checks for an n byte character at p (n is pasted onto
   the member name, so it must be a literal 2, 3 or 4).  Each calls the
   matching callback of the struct normal_encoding that enc points to. */
#define IS_NAME_CHAR(enc, p, n) \
  (((const struct normal_encoding *)(enc))->isName ## n(enc, p))

#define IS_NMSTRT_CHAR(enc, p, n) \
  (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))

#define IS_INVALID_CHAR(enc, p, n) \
  (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
226
/* Name and name-start checks for a minimum-size character at p.
   With XML_MIN_SIZE they call the encoding's isNameMin / isNmstrtMin
   callbacks; otherwise they are the constant 0. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p) \
     (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) \
     (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
236
/* CHAR_MATCHES(enc, p, c): true when the character at p equals c.
   With XML_MIN_SIZE it calls the encoding's charMatches callback, for
   which sb_charMatches compares *p with c; otherwise the comparison is
   done inline. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c) \
     (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
static int
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif
249
250#define PREFIX(ident) normal_ ## ident
251#include "xmltok_impl.c"
252
253#undef MINBPC
254#undef BYTE_TYPE
255#undef BYTE_TO_ASCII
256#undef CHAR_MATCHES
257#undef IS_NAME_CHAR
258#undef IS_NAME_CHAR_MINBPC
259#undef IS_NMSTRT_CHAR
260#undef IS_NMSTRT_CHAR_MINBPC
261#undef IS_INVALID_CHAR
262
/* UTF8_cvalN is the marker value OR-ed into the first byte of an N byte
   UTF-8 sequence (the value left after masking off the payload bits). */
enum {
  UTF8_cval1 = 0x00, /* 1 byte sequence  */
  UTF8_cval2 = 0xc0, /* 2 byte sequence  */
  UTF8_cval3 = 0xe0, /* 3 byte sequence  */
  UTF8_cval4 = 0xf0  /* 4 byte sequence  */
};
269
270static
271void utf8_toUtf8(const ENCODING *enc,
272 const char **fromP, const char *fromLim,
273 char **toP, const char *toLim)
274{
275 char *to;
276 const char *from;
277 if (fromLim - *fromP > toLim - *toP) {
278 /* Avoid copying partial characters. */
279 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
280 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
281 break;
282 }
283 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
284 *to = *from;
285 *fromP = from;
286 *toP = to;
287}
288
289static
290void utf8_toUtf16(const ENCODING *enc,
291 const char **fromP, const char *fromLim,
292 unsigned short **toP, const unsigned short *toLim)
293{
294 unsigned short *to = *toP;
295 const char *from = *fromP;
296 while (from != fromLim && to != toLim) {
297 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
298 case BT_LEAD2:
299 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
300 from += 2;
301 break;
302 case BT_LEAD3:
303 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
304 from += 3;
305 break;
306 case BT_LEAD4:
307 {
308 unsigned long n;
309 if (to + 1 == toLim)
310 break;
311 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
312 n -= 0x10000;
313 to[0] = (unsigned short)((n >> 10) | 0xD800);
314 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
315 to += 2;
316 from += 4;
317 }
318 break;
319 default:
320 *to++ = *from++;
321 break;
322 }
323 }
324 *fromP = from;
325 *toP = to;
326}
327
#ifdef XML_NS
/* UTF-8 encoding table built only when XML_NS is defined.  Unlike
   utf8_encoding, BT_COLON is not remapped while asciitab.h is included. */
static const struct normal_encoding utf8_encoding_ns = {
  /* ENCODING part: shared entry points, the UTF-8 converters, then three
     scalar fields (presumably minBytesPerChar, isUtf8, isUtf16; confirm
     against xmltok.h). */
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  /* type[]: byte types supplied by the two table headers. */
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  /* Character-class callbacks. */
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif
338
339static const struct normal_encoding utf8_encoding = {
340 { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId)
, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
341 {
342#define BT_COLON BT_NMSTRT
343#include "asciitab.h"
344#undef BT_COLON
345#include "utf8tab.h"
346 },
347 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)utf8_isName2, utf8_isName3, isNever, utf8_isNmstrt2, utf8_isNmstrt3
, isNever, isNever, utf8_isInvalid3, utf8_isInvalid4
348};
349
#ifdef XML_NS

/* Variant of utf8_encoding_ns whose low byte types come from iasciitab.h
   instead of asciitab.h (how the two headers differ is not visible here).
   Built only when XML_NS is defined. */
static const struct normal_encoding internal_utf8_encoding_ns = {
  /* ENCODING part: shared entry points, the UTF-8 converters, then three
     scalar fields (presumably minBytesPerChar, isUtf8, isUtf16; confirm
     against xmltok.h). */
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  /* type[]: byte types supplied by the two table headers. */
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  /* Character-class callbacks. */
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif
362
363static const struct normal_encoding internal_utf8_encoding = {
364 { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId)
, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
365 {
366#define BT_COLON BT_NMSTRT
367#include "iasciitab.h"
368#undef BT_COLON
369#include "utf8tab.h"
370 },
371 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)utf8_isName2, utf8_isName3, isNever, utf8_isNmstrt2, utf8_isNmstrt3
, isNever, isNever, utf8_isInvalid3, utf8_isInvalid4
372};
373
374static
375void latin1_toUtf8(const ENCODING *enc,
376 const char **fromP, const char *fromLim,
377 char **toP, const char *toLim)
378{
379 for (;;) {
380 unsigned char c;
381 if (*fromP == fromLim)
382 break;
383 c = (unsigned char)**fromP;
384 if (c & 0x80) {
385 if (toLim - *toP < 2)
386 break;
387 *(*toP)++ = ((c >> 6) | UTF8_cval2);
388 *(*toP)++ = ((c & 0x3f) | 0x80);
389 (*fromP)++;
390 }
391 else {
392 if (*toP == toLim)
393 break;
394 *(*toP)++ = *(*fromP)++;
395 }
396 }
397}
398
399static
400void latin1_toUtf16(const ENCODING *enc,
401 const char **fromP, const char *fromLim,
402 unsigned short **toP, const unsigned short *toLim)
403{
404 while (*fromP != fromLim && *toP != toLim)
405 *(*toP)++ = (unsigned char)*(*fromP)++;
406}
407
#ifdef XML_NS

/* Latin-1 encoding table built only when XML_NS is defined.  Unlike
   latin1_encoding, BT_COLON is not remapped while asciitab.h is included.
   Only the STANDARD_VTABLE callbacks are listed, so the multi-byte
   character-class members are left zero-initialized. */
static const struct normal_encoding latin1_encoding_ns = {
  /* ENCODING part: shared entry points, the Latin-1 converters, then three
     scalar fields (presumably minBytesPerChar, isUtf8, isUtf16; confirm
     against xmltok.h). */
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  /* type[]: byte types supplied by the two table headers. */
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};

#endif
420
421static const struct normal_encoding latin1_encoding = {
422 { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId)
, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
423 {
424#define BT_COLON BT_NMSTRT
425#include "asciitab.h"
426#undef BT_COLON
427#include "latin1tab.h"
428 },
429 STANDARD_VTABLE(sb_)
430};
431
432static
433void ascii_toUtf8(const ENCODING *enc,
434 const char **fromP, const char *fromLim,
435 char **toP, const char *toLim)
436{
437 while (*fromP != fromLim && *toP != toLim)
438 *(*toP)++ = *(*fromP)++;
439}
440
#ifdef XML_NS

/* ASCII encoding table built only when XML_NS is defined.  Unlike
   ascii_encoding, BT_COLON is not remapped while asciitab.h is included.
   Only asciitab.h fills type[]; the remaining entries stay 0, which the
   original comment identifies as BT_NONXML. */
static const struct normal_encoding ascii_encoding_ns = {
  /* ENCODING part: shared entry points, ascii_toUtf8 with the Latin-1
     UTF-16 converter, then three scalar fields (presumably
     minBytesPerChar, isUtf8, isUtf16; confirm against xmltok.h). */
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};

#endif
453
454static const struct normal_encoding ascii_encoding = {
455 { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId)
, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
456 {
457#define BT_COLON BT_NMSTRT
458#include "asciitab.h"
459#undef BT_COLON
460/* BT_NONXML == 0 */
461 },
462 STANDARD_VTABLE(sb_)
463};
464
465static int unicode_byte_type(char hi, char lo)
466{
467 switch ((unsigned char)hi) {
468 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
469 return BT_LEAD4;
470 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
471 return BT_TRAIL;
472 case 0xFF:
473 switch ((unsigned char)lo) {
474 case 0xFF:
475 case 0xFE:
476 return BT_NONXML;
477 }
478 break;
479 }
480 return BT_NONASCII;
481}
482
/* DEFINE_UTF16_TO_UTF8(E) defines the static function E##toUtf8, which
 * converts 2 byte units in [*fromP, fromLim) to UTF-8 in [*toP, toLim).
 * GET_LO and GET_HI must be defined at the point of expansion; they select
 * the byte order.
 *
 * The generated function advances *toP as it writes and stores in *fromP
 * the first input unit that was not converted.  It returns early when the
 * next character does not fit in the output.
 *
 * Changes from the previous definition:
 *  - fromLim is rounded down to a whole number of units, so an odd byte
 *    count can no longer step `from` past fromLim in the `!=` loop;
 *  - a lead surrogate is only converted when the following unit is inside
 *    the input; otherwise the function stops in front of it instead of
 *    reading past fromLim.
 */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  /* shrink to a whole number of 2 byte units */ \
  fromLim = from + (((fromLim - from) >> 1) << 1); \
  for (; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      /* needs 4 output bytes and the second unit of the pair */ \
      if (toLim - *toP < 4 || fromLim - from < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
545
/* DEFINE_UTF16_TO_UTF16(E) defines the static function E##toUtf16, which
 * copies 2 byte units in [*fromP, fromLim) into unsigned short units in
 * [*toP, toLim), combining the bytes selected by GET_HI and GET_LO.
 * GET_LO and GET_HI must be defined at the point of expansion.
 *
 * *fromP and *toP are advanced until the input or the output is used up.
 * If there is more input than output space and the high byte of the last
 * input unit satisfies (hi & 0xF8) == 0xD8, that unit is left unconverted.
 *
 * Change from the previous definition: fromLim is first rounded down to a
 * whole number of units, so an odd byte count no longer makes the loop
 * read past the end of the input.
 */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* shrink to a whole number of 2 byte units */ \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
559
560#define SET2(ptr, ch) \
561 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
562#define GET_LO(ptr) ((unsigned char)(ptr)[0])
563#define GET_HI(ptr) ((unsigned char)(ptr)[1])
564
565DEFINE_UTF16_TO_UTF8(little2_)static void little2_toUtf8(const ENCODING *enc, const char **
fromP, const char *fromLim, char **toP, const char *toLim) { const
char *from; for (from = *fromP; from != fromLim; from += 2) {
int plane; unsigned char lo2; unsigned char lo = GET_LO(from
); unsigned char hi = GET_HI(from); switch (hi) { case 0: if (
lo < 0x80) { if (*toP == toLim) { *fromP = from; return; }
*(*toP)++ = lo; break; } case 0x1: case 0x2: case 0x3: case 0x4
: case 0x5: case 0x6: case 0x7: if (toLim - *toP < 2) { *fromP
= from; return; } *(*toP)++ = ((lo >> 6) | (hi <<
2) | UTF8_cval2); *(*toP)++ = ((lo & 0x3f) | 0x80); break
; default: if (toLim - *toP < 3) { *fromP = from; return; }
*(*toP)++ = ((hi >> 4) | UTF8_cval3); *(*toP)++ = (((hi
& 0xf) << 2) | (lo >> 6) | 0x80); *(*toP)++ =
((lo & 0x3f) | 0x80); break; case 0xD8: case 0xD9: case 0xDA
: case 0xDB: if (toLim - *toP < 4) { *fromP = from; return
; } plane = (((hi & 0x3) << 2) | ((lo >> 6) &
0x3)) + 1; *(*toP)++ = ((plane >> 2) | UTF8_cval4); *(
*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) <<
4) | 0x80); from += 2; lo2 = GET_LO(from); *(*toP)++ = (((lo
& 0x3) << 4) | ((GET_HI(from) & 0x3) << 2
) | (lo2 >> 6) | 0x80); *(*toP)++ = ((lo2 & 0x3f) |
0x80); break; } } *fromP = from; }
566DEFINE_UTF16_TO_UTF16(little2_)static void little2_toUtf16(const ENCODING *enc, const char *
*fromP, const char *fromLim, unsigned short **toP, const unsigned
short *toLim) { if (fromLim - *fromP > ((toLim - *toP) <<
1) && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) fromLim
-= 2; for (; *fromP != fromLim && *toP != toLim; *fromP
+= 2) *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP
); }
567
568#undef SET2
569#undef GET_LO
570#undef GET_HI
571
572#define SET2(ptr, ch) \
573 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
574#define GET_LO(ptr) ((unsigned char)(ptr)[1])
575#define GET_HI(ptr) ((unsigned char)(ptr)[0])
576
/* Instantiates big2_toUtf8 from the DEFINE_UTF16_TO_UTF8 template; the text
   after the invocation is the analyzer's rendering of the expansion.
   Each 16-bit unit is re-encoded as 1, 2 or 3 UTF-8 bytes depending on its
   high byte; a unit whose high byte is 0xD8..0xDB is combined with the unit
   that follows it into a 4-byte sequence.  When the output has too little
   room, the current input position is stored in *fromP and the function
   returns.  Note the fall-through from `case 0` (when lo >= 0x80) into the
   two-byte branch.
   NOTE(review): the 0xD8..0xDB branch advances `from` by 2 and reads the
   next unit without comparing against fromLim -- confirm callers never pass
   a truncated surrogate pair. */
577DEFINE_UTF16_TO_UTF8(big2_)static void big2_toUtf8(const ENCODING *enc, const char **fromP
, const char *fromLim, char **toP, const char *toLim) { const
char *from; for (from = *fromP; from != fromLim; from += 2) {
int plane; unsigned char lo2; unsigned char lo = GET_LO(from
); unsigned char hi = GET_HI(from); switch (hi) { case 0: if (
lo < 0x80) { if (*toP == toLim) { *fromP = from; return; }
*(*toP)++ = lo; break; } case 0x1: case 0x2: case 0x3: case 0x4
: case 0x5: case 0x6: case 0x7: if (toLim - *toP < 2) { *fromP
= from; return; } *(*toP)++ = ((lo >> 6) | (hi <<
2) | UTF8_cval2); *(*toP)++ = ((lo & 0x3f) | 0x80); break
; default: if (toLim - *toP < 3) { *fromP = from; return; }
*(*toP)++ = ((hi >> 4) | UTF8_cval3); *(*toP)++ = (((hi
& 0xf) << 2) | (lo >> 6) | 0x80); *(*toP)++ =
((lo & 0x3f) | 0x80); break; case 0xD8: case 0xD9: case 0xDA
: case 0xDB: if (toLim - *toP < 4) { *fromP = from; return
; } plane = (((hi & 0x3) << 2) | ((lo >> 6) &
0x3)) + 1; *(*toP)++ = ((plane >> 2) | UTF8_cval4); *(
*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) <<
4) | 0x80); from += 2; lo2 = GET_LO(from); *(*toP)++ = (((lo
& 0x3) << 4) | ((GET_HI(from) & 0x3) << 2
) | (lo2 >> 6) | 0x80); *(*toP)++ = ((lo2 & 0x3f) |
0x80); break; } } *fromP = from; }
/* Instantiates big2_toUtf16 from the DEFINE_UTF16_TO_UTF16 template; the
   text after the invocation is the analyzer's rendering of the expansion.
   Copies 16-bit units (high byte first in the input) from *fromP to *toP
   until the input or the output is exhausted.  If the output cannot hold all
   remaining input and the last input unit has a high byte in 0xD8..0xDF,
   fromLim is first pulled back by one unit so that unit is not copied. */
578DEFINE_UTF16_TO_UTF16(big2_)static void big2_toUtf16(const ENCODING *enc, const char **fromP
, const char *fromLim, unsigned short **toP, const unsigned short
*toLim) { if (fromLim - *fromP > ((toLim - *toP) <<
1) && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) fromLim
-= 2; for (; *fromP != fromLim && *toP != toLim; *fromP
+= 2) *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP
); }
579
580#undef SET2
581#undef GET_LO
582#undef GET_HI
583
/* Per-character helpers for little-endian two-byte input, where p[0] is the
   low byte and p[1] the high byte.  A character whose high byte is zero is
   looked up in the encoding's byte-type table (or compared directly); any
   other character goes through unicode_byte_type() or the UCS2_GET_NAMING
   bitmaps.  The text fused after each macro head is the analyzer's rendering
   of the expansion. */
584#define LITTLE2_BYTE_TYPE(enc, p)((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned
char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
\
585 ((p)[1] == 0 \
586 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
587 : unicode_byte_type((p)[1], (p)[0]))
588#define LITTLE2_BYTE_TO_ASCII(enc, p)((p)[1] == 0 ? (p)[0] : -1) ((p)[1] == 0 ? (p)[0] : -1)
589#define LITTLE2_CHAR_MATCHES(enc, p, c)((p)[1] == 0 && (p)[0] == c) ((p)[1] == 0 && (p)[0] == c)
590#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + (
((unsigned char)p[0]) >> 5)] & (1 << (((unsigned
char)p[0]) & 0x1F)))
\
591 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + (
((unsigned char)p[0]) >> 5)] & (1 << (((unsigned
char)p[0]) & 0x1F)))
592#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) +
(((unsigned char)p[0]) >> 5)] & (1 << (((unsigned
char)p[0]) & 0x1F)))
\
593 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) +
(((unsigned char)p[0]) >> 5)] & (1 << (((unsigned
char)p[0]) & 0x1F)))
594
595#ifdef XML_MIN_SIZE
596
597static
598int little2_byteType(const ENCODING *enc, const char *p)
599{
600 return LITTLE2_BYTE_TYPE(enc, p)((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned
char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
;
601}
602
603static
604int little2_byteToAscii(const ENCODING *enc, const char *p)
605{
606 return LITTLE2_BYTE_TO_ASCII(enc, p)((p)[1] == 0 ? (p)[0] : -1);
607}
608
609static
610int little2_charMatches(const ENCODING *enc, const char *p, int c)
611{
612 return LITTLE2_CHAR_MATCHES(enc, p, c)((p)[1] == 0 && (p)[0] == c);
613}
614
615static
616int little2_isNameMin(const ENCODING *enc, const char *p)
617{
618 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + (
((unsigned char)p[0]) >> 5)] & (1 << (((unsigned
char)p[0]) & 0x1F)))
;
619}
620
621static
622int little2_isNmstrtMin(const ENCODING *enc, const char *p)
623{
624 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) +
(((unsigned char)p[0]) >> 5)] & (1 << (((unsigned
char)p[0]) & 0x1F)))
;
625}
626
627#undef VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
628#define VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId)
, little2_toUtf8, little2_toUtf16
629
630#else /* not XML_MIN_SIZE */
631
632#undef PREFIX
633#define PREFIX(ident) little2_ ## ident
634#define MINBPC(enc) 2
635/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
636#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned
char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
637#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)((p)[1] == 0 ? (p)[0] : -1)
638#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)((p)[1] == 0 && (p)[0] == c)
639#define IS_NAME_CHAR(enc, p, n) 0
640#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + (
((unsigned char)p[0]) >> 5)] & (1 << (((unsigned
char)p[0]) & 0x1F)))
641#define IS_NMSTRT_CHAR(enc, p, n) (0)
642#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) +
(((unsigned char)p[0]) >> 5)] & (1 << (((unsigned
char)p[0]) & 0x1F)))
643
644#include "xmltok_impl.c"
645
646#undef MINBPC
647#undef BYTE_TYPE
648#undef BYTE_TO_ASCII
649#undef CHAR_MATCHES
650#undef IS_NAME_CHAR
651#undef IS_NAME_CHAR_MINBPC
652#undef IS_NMSTRT_CHAR
653#undef IS_NMSTRT_CHAR_MINBPC
654#undef IS_INVALID_CHAR
655
656#endif /* not XML_MIN_SIZE */
657
658#ifdef XML_NS
659
/* Little-endian two-byte encoding table for the XML_NS build.  The first
   member is the VTABLE function list followed by `2, 0,` and a flag that is
   1 only when XML_BYTE_ORDER equals 12 (presumably minBytesPerChar, isUtf8
   and isUtf16 -- confirm against the ENCODING declaration).  The byte-type
   table is asciitab.h followed by latin1tab.h, with BT_COLON left as those
   headers see it. */
660static const struct normal_encoding little2_encoding_ns = {
661 { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
, 2, 0,
662#if XML_BYTE_ORDER12 == 12
663 1
664#else
665 0
666#endif
667 },
668 {
669#include "asciitab.h"
670#include "latin1tab.h"
671 },
672 STANDARD_VTABLE(little2_)
673};
674
675#endif
676
/* Little-endian two-byte encoding table without namespace processing.  Same
   layout as the _ns variant, except that BT_COLON is defined as BT_NMSTRT
   while asciitab.h is included, so whatever that header tags BT_COLON is
   classified as a name-start character.  The flag after `2, 0,` is 1 only
   when XML_BYTE_ORDER equals 12. */
677static const struct normal_encoding little2_encoding = {
678 { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
, 2, 0,
679#if XML_BYTE_ORDER12 == 12
680 1
681#else
682 0
683#endif
684 },
685 {
686#define BT_COLON BT_NMSTRT
687#include "asciitab.h"
688#undef BT_COLON
689#include "latin1tab.h"
690 },
691 STANDARD_VTABLE(little2_)
692};
693
694#if XML_BYTE_ORDER12 != 21
695
696#ifdef XML_NS
697
/* "Internal" little-endian two-byte encoding table for the XML_NS build;
   compiled only inside the enclosing `#if XML_BYTE_ORDER != 21`.  It differs
   from little2_encoding_ns in using iasciitab.h for the low half of the
   byte-type table and in hard-coding the flag after `2, 0,` to 1. */
698static const struct normal_encoding internal_little2_encoding_ns = {
699 { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
, 2, 0, 1 },
700 {
701#include "iasciitab.h"
702#include "latin1tab.h"
703 },
704 STANDARD_VTABLE(little2_)
705};
706
707#endif
708
/* "Internal" little-endian two-byte encoding table without namespace
   processing; compiled only inside the enclosing
   `#if XML_BYTE_ORDER != 21`.  Uses iasciitab.h with BT_COLON defined as
   BT_NMSTRT during the include, and hard-codes the flag after `2, 0,` to 1. */
709static const struct normal_encoding internal_little2_encoding = {
710 { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
, 2, 0, 1 },
711 {
712#define BT_COLON BT_NMSTRT
713#include "iasciitab.h"
714#undef BT_COLON
715#include "latin1tab.h"
716 },
717 STANDARD_VTABLE(little2_)
718};
719
720#endif
721
722
/* Per-character helpers for big-endian two-byte input, where p[0] is the
   high byte and p[1] the low byte.  A character whose high byte is zero is
   looked up in the encoding's byte-type table (or compared directly); any
   other character goes through unicode_byte_type() or the UCS2_GET_NAMING
   bitmaps.  The text fused after each macro head is the analyzer's rendering
   of the expansion. */
723#define BIG2_BYTE_TYPE(enc, p)((p)[0] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned
char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
\
724 ((p)[0] == 0 \
725 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
726 : unicode_byte_type((p)[0], (p)[1]))
727#define BIG2_BYTE_TO_ASCII(enc, p)((p)[0] == 0 ? (p)[1] : -1) ((p)[0] == 0 ? (p)[1] : -1)
728#define BIG2_CHAR_MATCHES(enc, p, c)((p)[0] == 0 && (p)[1] == c) ((p)[0] == 0 && (p)[1] == c)
729#define BIG2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + (
((unsigned char)p[1]) >> 5)] & (1 << (((unsigned
char)p[1]) & 0x1F)))
\
730 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + (
((unsigned char)p[1]) >> 5)] & (1 << (((unsigned
char)p[1]) & 0x1F)))
731#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) +
(((unsigned char)p[1]) >> 5)] & (1 << (((unsigned
char)p[1]) & 0x1F)))
\
732 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) +
(((unsigned char)p[1]) >> 5)] & (1 << (((unsigned
char)p[1]) & 0x1F)))
733
734#ifdef XML_MIN_SIZE
735
736static
737int big2_byteType(const ENCODING *enc, const char *p)
738{
739 return BIG2_BYTE_TYPE(enc, p)((p)[0] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned
char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
;
740}
741
742static
743int big2_byteToAscii(const ENCODING *enc, const char *p)
744{
745 return BIG2_BYTE_TO_ASCII(enc, p)((p)[0] == 0 ? (p)[1] : -1);
746}
747
748static
749int big2_charMatches(const ENCODING *enc, const char *p, int c)
750{
751 return BIG2_CHAR_MATCHES(enc, p, c)((p)[0] == 0 && (p)[1] == c);
752}
753
754static
755int big2_isNameMin(const ENCODING *enc, const char *p)
756{
757 return BIG2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + (
((unsigned char)p[1]) >> 5)] & (1 << (((unsigned
char)p[1]) & 0x1F)))
;
758}
759
760static
761int big2_isNmstrtMin(const ENCODING *enc, const char *p)
762{
763 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) +
(((unsigned char)p[1]) >> 5)] & (1 << (((unsigned
char)p[1]) & 0x1F)))
;
764}
765
766#undef VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
767#define VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId)
, big2_toUtf8, big2_toUtf16
768
769#else /* not XML_MIN_SIZE */
770
771#undef PREFIX
772#define PREFIX(ident) big2_ ## ident
773#define MINBPC(enc) 2
774/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
775#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)((p)[0] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned
char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
776#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)((p)[0] == 0 ? (p)[1] : -1)
777#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)((p)[0] == 0 && (p)[1] == c)
778#define IS_NAME_CHAR(enc, p, n) 0
779#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + (
((unsigned char)p[1]) >> 5)] & (1 << (((unsigned
char)p[1]) & 0x1F)))
780#define IS_NMSTRT_CHAR(enc, p, n) (0)
781#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) +
(((unsigned char)p[1]) >> 5)] & (1 << (((unsigned
char)p[1]) & 0x1F)))
782
783#include "xmltok_impl.c"
784
785#undef MINBPC
786#undef BYTE_TYPE
787#undef BYTE_TO_ASCII
788#undef CHAR_MATCHES
789#undef IS_NAME_CHAR
790#undef IS_NAME_CHAR_MINBPC
791#undef IS_NMSTRT_CHAR
792#undef IS_NMSTRT_CHAR_MINBPC
793#undef IS_INVALID_CHAR
794
795#endif /* not XML_MIN_SIZE */
796
797#ifdef XML_NS
798
/* Big-endian two-byte encoding table for the XML_NS build.  The first
   member is the VTABLE function list followed by `2, 0,` and a flag that is
   1 only when XML_BYTE_ORDER equals 21 (presumably minBytesPerChar, isUtf8
   and isUtf16 -- confirm against the ENCODING declaration).  The byte-type
   table is asciitab.h followed by latin1tab.h, with BT_COLON left as those
   headers see it. */
799static const struct normal_encoding big2_encoding_ns = {
800 { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
, 2, 0,
801#if XML_BYTE_ORDER12 == 21
802 1
803#else
804 0
805#endif
806 },
807 {
808#include "asciitab.h"
809#include "latin1tab.h"
810 },
811 STANDARD_VTABLE(big2_)
812};
813
814#endif
815
/* Big-endian two-byte encoding table without namespace processing.  Same
   layout as the _ns variant, except that BT_COLON is defined as BT_NMSTRT
   while asciitab.h is included, so whatever that header tags BT_COLON is
   classified as a name-start character.  The flag after `2, 0,` is 1 only
   when XML_BYTE_ORDER equals 21. */
816static const struct normal_encoding big2_encoding = {
817 { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
, 2, 0,
818#if XML_BYTE_ORDER12 == 21
819 1
820#else
821 0
822#endif
823 },
824 {
825#define BT_COLON BT_NMSTRT
826#include "asciitab.h"
827#undef BT_COLON
828#include "latin1tab.h"
829 },
830 STANDARD_VTABLE(big2_)
831};
832
833#if XML_BYTE_ORDER12 != 12
834
835#ifdef XML_NS
836
/* "Internal" big-endian two-byte encoding table for the XML_NS build;
   compiled only inside the enclosing `#if XML_BYTE_ORDER != 12`.  It differs
   from big2_encoding_ns in using iasciitab.h for the low half of the
   byte-type table and in hard-coding the flag after `2, 0,` to 1. */
837static const struct normal_encoding internal_big2_encoding_ns = {
838 { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
, 2, 0, 1 },
839 {
840#include "iasciitab.h"
841#include "latin1tab.h"
842 },
843 STANDARD_VTABLE(big2_)
844};
845
846#endif
847
/* "Internal" big-endian two-byte encoding table without namespace
   processing; compiled only inside the enclosing
   `#if XML_BYTE_ORDER != 12`.  Uses iasciitab.h with BT_COLON defined as
   BT_NMSTRT during the include, and hard-codes the flag after `2, 0,` to 1. */
848static const struct normal_encoding internal_big2_encoding = {
849 { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok
) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX
(entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii
), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX
(charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition
), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16)
, 2, 0, 1 },
850 {
851#define BT_COLON BT_NMSTRT
852#include "iasciitab.h"
853#undef BT_COLON
854#include "latin1tab.h"
855 },
856 STANDARD_VTABLE(big2_)
857};
858
859#endif
860
861#undef PREFIX
862
863static
864int streqci(const char *s1, const char *s2)
865{
866 for (;;) {
867 char c1 = *s1++;
868 char c2 = *s2++;
869 if (ASCII_a0x61 <= c1 && c1 <= ASCII_z0x7A)
870 c1 += ASCII_A0x41 - ASCII_a0x61;
871 if (ASCII_a0x61 <= c2 && c2 <= ASCII_z0x7A)
872 c2 += ASCII_A0x41 - ASCII_a0x61;
873 if (c1 != c2)
874 return 0;
875 if (!c1)
876 break;
877 }
878 return 1;
879}
880
881static
882void initUpdatePosition(const ENCODING *enc, const char *ptr,
883 const char *end, POSITION *pos)
884{
885 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
886}
887
888static
889int toAscii(const ENCODING *enc, const char *ptr, const char *end)
890{
891 char buf[1];
892 char *p = buf;
893 XmlUtf8Convert(enc, &ptr, end, &p, p + 1)(((enc)->utf8Convert)(enc, &ptr, end, &p, p + 1));
894 if (p == buf)
895 return -1;
896 else
897 return buf[0];
898}
899
/* Return 1 when c is one of the four white-space code points
   (space 0x20, carriage return 0xD, line feed 0xA, tab 0x9), else 0. */
static
int isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
912
913/* Return 1 if there's just optional white space
914or there's an S followed by name=val. */
915static
916int parsePseudoAttribute(const ENCODING *enc,
917 const char *ptr,
918 const char *end,
919 const char **namePtr,
920 const char **nameEndPtr,
921 const char **valPtr,
922 const char **nextTokPtr)
923{
924 int c;
925 char open;
926 if (ptr == end) {
927 *namePtr = 0;
928 return 1;
929 }
930 if (!isSpace(toAscii(enc, ptr, end))) {
931 *nextTokPtr = ptr;
932 return 0;
933 }
934 do {
935 ptr += enc->minBytesPerChar;
936 } while (isSpace(toAscii(enc, ptr, end)));
937 if (ptr == end) {
938 *namePtr = 0;
939 return 1;
940 }
941 *namePtr = ptr;
942 for (;;) {
943 c = toAscii(enc, ptr, end);
944 if (c == -1) {
945 *nextTokPtr = ptr;
946 return 0;
947 }
948 if (c == ASCII_EQUALS0x3D) {
949 *nameEndPtr = ptr;
950 break;
951 }
952 if (isSpace(c)) {
953 *nameEndPtr = ptr;
954 do {
955 ptr += enc->minBytesPerChar;
956 } while (isSpace(c = toAscii(enc, ptr, end)));
957 if (c != ASCII_EQUALS0x3D) {
958 *nextTokPtr = ptr;
959 return 0;
960 }
961 break;
962 }
963 ptr += enc->minBytesPerChar;
964 }
965 if (ptr == *namePtr) {
966 *nextTokPtr = ptr;
967 return 0;
968 }
969 ptr += enc->minBytesPerChar;
970 c = toAscii(enc, ptr, end);
971 while (isSpace(c)) {
972 ptr += enc->minBytesPerChar;
973 c = toAscii(enc, ptr, end);
974 }
975 if (c != ASCII_QUOT0x22 && c != ASCII_APOS0x27) {
976 *nextTokPtr = ptr;
977 return 0;
978 }
979 open = c;
980 ptr += enc->minBytesPerChar;
981 *valPtr = ptr;
982 for (;; ptr += enc->minBytesPerChar) {
983 c = toAscii(enc, ptr, end);
984 if (c == open)
985 break;
986 if (!(ASCII_a0x61 <= c && c <= ASCII_z0x7A)
987 && !(ASCII_A0x41 <= c && c <= ASCII_Z0x5A)
988 && !(ASCII_00x30 <= c && c <= ASCII_90x39)
989 && c != ASCII_PERIOD0x2E
990 && c != ASCII_MINUS0x2D
991 && c != ASCII_UNDERSCORE0x5F) {
992 *nextTokPtr = ptr;
993 return 0;
994 }
995 }
996 *nextTokPtr = ptr + enc->minBytesPerChar;
997 return 1;
998}
999
/* Keyword strings matched by doParseXmlDecl via XmlNameMatchesAscii.  They
   are spelled with explicit ASCII_* code-point constants rather than string
   literals (the hex value fused to each name below is the analyzer's
   rendering of the constant's expansion). */
1000static const char KW_version[] = {
1001 ASCII_v0x76, ASCII_e0x65, ASCII_r0x72, ASCII_s0x73, ASCII_i0x69, ASCII_o0x6F, ASCII_n0x6E, '\0'
1002};
1003
1004static const char KW_encoding[] = {
1005 ASCII_e0x65, ASCII_n0x6E, ASCII_c0x63, ASCII_o0x6F, ASCII_d0x64, ASCII_i0x69, ASCII_n0x6E, ASCII_g0x67, '\0'
1006};
1007
1008static const char KW_standalone[] = {
1009 ASCII_s0x73, ASCII_t0x74, ASCII_a0x61, ASCII_n0x6E, ASCII_d0x64, ASCII_a0x61, ASCII_l0x6C, ASCII_o0x6F, ASCII_n0x6E, ASCII_e0x65, '\0'
1010};
1011
1012static const char KW_yes[] = {
1013 ASCII_y0x79, ASCII_e0x65, ASCII_s0x73, '\0'
1014};
1015
1016static const char KW_no[] = {
1017 ASCII_n0x6E, ASCII_o0x6F, '\0'
1018};
1019
1020static
1021int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1022 const char *,
1023 const char *),
1024 int isGeneralTextEntity,
1025 const ENCODING *enc,
1026 const char *ptr,
1027 const char *end,
1028 const char **badPtr,
1029 const char **versionPtr,
1030 const char **versionEndPtr,
1031 const char **encodingName,
1032 const ENCODING **encoding,
1033 int *standalone)
1034{
1035 const char *val = 0;
1036 const char *name = 0;
1037 const char *nameEnd = 0;
1038 ptr += 5 * enc->minBytesPerChar;
1039 end -= 2 * enc->minBytesPerChar;
1040 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name) {
1041 *badPtr = ptr;
1042 return 0;
1043 }
1044 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)(((enc)->nameMatchesAscii)(enc, name, nameEnd, KW_version)
)
) {
1045 if (!isGeneralTextEntity) {
1046 *badPtr = name;
1047 return 0;
1048 }
1049 }
1050 else {
1051 if (versionPtr)
1052 *versionPtr = val;
1053 if (versionEndPtr)
1054 *versionEndPtr = ptr;
1055 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1056 *badPtr = ptr;
1057 return 0;
1058 }
1059 if (!name) {
1060 if (isGeneralTextEntity) {
1061 /* a TextDecl must have an EncodingDecl */
1062 *badPtr = ptr;
1063 return 0;
1064 }
1065 return 1;
1066 }
1067 }
1068 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)(((enc)->nameMatchesAscii)(enc, name, nameEnd, KW_encoding
))
) {
1069 int c = toAscii(enc, val, end);
1070 if (!(ASCII_a0x61 <= c && c <= ASCII_z0x7A) && !(ASCII_A0x41 <= c && c <= ASCII_Z0x5A)) {
1071 *badPtr = val;
1072 return 0;
1073 }
1074 if (encodingName)
1075 *encodingName = val;
1076 if (encoding)
1077 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1078 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1079 *badPtr = ptr;
1080 return 0;
1081 }
1082 if (!name)
1083 return 1;
1084 }
1085 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)(((enc)->nameMatchesAscii)(enc, name, nameEnd, KW_standalone
))
|| isGeneralTextEntity) {
1086 *badPtr = name;
1087 return 0;
1088 }
1089 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)(((enc)->nameMatchesAscii)(enc, val, ptr - enc->minBytesPerChar
, KW_yes))
) {
1090 if (standalone)
1091 *standalone = 1;
1092 }
1093 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)(((enc)->nameMatchesAscii)(enc, val, ptr - enc->minBytesPerChar
, KW_no))
) {
1094 if (standalone)
1095 *standalone = 0;
1096 }
1097 else {
1098 *badPtr = val;
1099 return 0;
1100 }
1101 while (isSpace(toAscii(enc, ptr, end)))
1102 ptr += enc->minBytesPerChar;
1103 if (ptr != end) {
1104 *badPtr = ptr;
1105 return 0;
1106 }
1107 return 1;
1108}
1109
1110static
1111int checkCharRefNumber(int result)
1112{
1113 switch (result >> 8) {
1114 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1115 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1116 return -1;
1117 case 0:
1118 if (latin1_encoding.type[result] == BT_NONXML)
1119 return -1;
1120 break;
1121 case 0xFF:
1122 if (result == 0xFFFE || result == 0xFFFF)
1123 return -1;
1124 break;
1125 }
1126 return result;
1127}
1128
1129int XmlUtf8Encode(int c, char *buf)
1130{
1131 enum {
1132 /* minN is minimum legal resulting value for N byte sequence */
1133 min2 = 0x80,
1134 min3 = 0x800,
1135 min4 = 0x10000
1136 };
1137
1138 if (c < 0)
1139 return 0;
1140 if (c < min2) {
1141 buf[0] = (c | UTF8_cval1);
1142 return 1;
1143 }
1144 if (c < min3) {
1145 buf[0] = ((c >> 6) | UTF8_cval2);
1146 buf[1] = ((c & 0x3f) | 0x80);
1147 return 2;
1148 }
1149 if (c < min4) {
1150 buf[0] = ((c >> 12) | UTF8_cval3);
1151 buf[1] = (((c >> 6) & 0x3f) | 0x80);
1152 buf[2] = ((c & 0x3f) | 0x80);
1153 return 3;
1154 }
1155 if (c < 0x110000) {
1156 buf[0] = ((c >> 18) | UTF8_cval4);
1157 buf[1] = (((c >> 12) & 0x3f) | 0x80);
1158 buf[2] = (((c >> 6) & 0x3f) | 0x80);
1159 buf[3] = ((c & 0x3f) | 0x80);
1160 return 4;
1161 }
1162 return 0;
1163}
1164
/* Encode code point charNum as UTF-16 into buf.
   Returns 1 when a single unit was written, 2 when a surrogate pair was
   written, and 0 when charNum is negative or not below 0x110000 (buf is
   then left untouched). */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }

  {
    /* Split the 20-bit offset into high and low surrogate halves. */
    const int offset = charNum - 0x10000;
    buf[0] = (offset >> 10) + 0xD800;
    buf[1] = (offset & 0x3FF) + 0xDC00;
  }
  return 2;
}
1181
/* State for a user-defined single- or multi-byte encoding, filled in by
   XmlInitUnknownEncoding.  The unknown_* routines cast the same ENCODING
   pointer to both struct normal_encoding and struct unknown_encoding, so
   `normal` must remain the first member. */
1182struct unknown_encoding {
1183 struct normal_encoding normal;
1184 int (*convert)(void *userData, const char *p); /* decodes the multi-byte sequence at p; called when a table entry is 0 */
1185 void *userData; /* passed through to convert unchanged */
1186 unsigned short utf16[256]; /* per input byte: UTF-16 unit, or 0 meaning "call convert" */
1187 char utf8[256][4]; /* per input byte: [0] = byte count (0 meaning "call convert"), then the UTF-8 bytes */
1188};
1189
1190int XmlSizeOfUnknownEncoding(void)
1191{
1192 return sizeof(struct unknown_encoding);
1193}
1194
1195static
1196int unknown_isName(const ENCODING *enc, const char *p)
1197{
1198 int c = ((const struct unknown_encoding *)enc)
1199 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1200 if (c & ~0xFFFF)
1201 return 0;
1202 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF)(namingBitmap[(namePages[c >> 8] << 3) + ((c &
0xFF) >> 5)] & (1 << ((c & 0xFF) & 0x1F
)))
;
1203}
1204
1205static
1206int unknown_isNmstrt(const ENCODING *enc, const char *p)
1207{
1208 int c = ((const struct unknown_encoding *)enc)
1209 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1210 if (c & ~0xFFFF)
1211 return 0;
1212 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF)(namingBitmap[(nmstrtPages[c >> 8] << 3) + ((c &
0xFF) >> 5)] & (1 << ((c & 0xFF) & 0x1F
)))
;
1213}
1214
1215static
1216int unknown_isInvalid(const ENCODING *enc, const char *p)
1217{
1218 int c = ((const struct unknown_encoding *)enc)
1219 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1220 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1221}
1222
1223static
1224void unknown_toUtf8(const ENCODING *enc,
1225 const char **fromP, const char *fromLim,
1226 char **toP, const char *toLim)
1227{
1228 char buf[XML_UTF8_ENCODE_MAX4];
1229 for (;;) {
1
Loop condition is true. Entering loop body
1230 const char *utf8;
1231 int n;
1232 if (*fromP == fromLim)
2
Taking false branch
1233 break;
1234 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1235 n = *utf8++;
1236 if (n == 0) {
3
Assuming 'n' is equal to 0
4
Taking true branch
1237 int c = ((const struct unknown_encoding *)enc)
1238 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1239 n = XmlUtf8Encode(c, buf);
1240 if (n > toLim - *toP)
5
Taking false branch
1241 break;
1242 utf8 = buf;
1243 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1244 - (BT_LEAD2 - 2);
1245 }
1246 else {
1247 if (n > toLim - *toP)
1248 break;
1249 (*fromP)++;
1250 }
1251 do {
1252 *(*toP)++ = *utf8++;
6
Assigned value is garbage or undefined
1253 } while (--n != 0);
1254 }
1255}
1256
1257static
1258void unknown_toUtf16(const ENCODING *enc,
1259 const char **fromP, const char *fromLim,
1260 unsigned short **toP, const unsigned short *toLim)
1261{
1262 while (*fromP != fromLim && *toP != toLim) {
1263 unsigned short c
1264 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1265 if (c == 0) {
1266 c = (unsigned short)((const struct unknown_encoding *)enc)
1267 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1268 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1269 - (BT_LEAD2 - 2);
1270 }
1271 else
1272 (*fromP)++;
1273 *(*toP)++ = c;
1274 }
1275}
1276
1277ENCODING *
1278XmlInitUnknownEncoding(void *mem,
1279 int *table,
1280 int (*convert)(void *userData, const char *p),
1281 void *userData)
1282{
1283 int i;
1284 struct unknown_encoding *e = mem;
1285 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1286 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1287 for (i = 0; i < 128; i++)
1288 if (latin1_encoding.type[i] != BT_OTHER
1289 && latin1_encoding.type[i] != BT_NONXML
1290 && table[i] != i)
1291 return 0;
1292 for (i = 0; i < 256; i++) {
1293 int c = table[i];
1294 if (c == -1) {
1295 e->normal.type[i] = BT_MALFORM;
1296 /* This shouldn't really get used. */
1297 e->utf16[i] = 0xFFFF;
1298 e->utf8[i][0] = 1;
1299 e->utf8[i][1] = 0;
1300 }
1301 else if (c < 0) {
1302 if (c < -4)
1303 return 0;
1304 e->normal.type[i] = BT_LEAD2 - (c + 2);
1305 e->utf8[i][0] = 0;
1306 e->utf16[i] = 0;
1307 }
1308 else if (c < 0x80) {
1309 if (latin1_encoding.type[c] != BT_OTHER
1310 && latin1_encoding.type[c] != BT_NONXML
1311 && c != i)
1312 return 0;
1313 e->normal.type[i] = latin1_encoding.type[c];
1314 e->utf8[i][0] = 1;
1315 e->utf8[i][1] = (char)c;
1316 e->utf16[i] = c == 0 ? 0xFFFF : c;
1317 }
1318 else if (checkCharRefNumber(c) < 0) {
1319 e->normal.type[i] = BT_NONXML;
1320 /* This shouldn't really get used. */
1321 e->utf16[i] = 0xFFFF;
1322 e->utf8[i][0] = 1;
1323 e->utf8[i][1] = 0;
1324 }
1325 else {
1326 if (c > 0xFFFF)
1327 return 0;
1328 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)(namingBitmap[(nmstrtPages[c >> 8] << 3) + ((c &
0xff) >> 5)] & (1 << ((c & 0xff) & 0x1F
)))
)
1329 e->normal.type[i] = BT_NMSTRT;
1330 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)(namingBitmap[(namePages[c >> 8] << 3) + ((c &
0xff) >> 5)] & (1 << ((c & 0xff) & 0x1F
)))
)
1331 e->normal.type[i] = BT_NAME;
1332 else
1333 e->normal.type[i] = BT_OTHER;
1334 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1335 e->utf16[i] = c;
1336 }
1337 }
1338 e->userData = userData;
1339 e->convert = convert;
1340 if (convert) {
1341 e->normal.isName2 = unknown_isName;
1342 e->normal.isName3 = unknown_isName;
1343 e->normal.isName4 = unknown_isName;
1344 e->normal.isNmstrt2 = unknown_isNmstrt;
1345 e->normal.isNmstrt3 = unknown_isNmstrt;
1346 e->normal.isNmstrt4 = unknown_isNmstrt;
1347 e->normal.isInvalid2 = unknown_isInvalid;
1348 e->normal.isInvalid3 = unknown_isInvalid;
1349 e->normal.isInvalid4 = unknown_isInvalid;
1350 }
1351 e->normal.enc.utf8Convert = unknown_toUtf8;
1352 e->normal.enc.utf16Convert = unknown_toUtf16;
1353 return &(e->normal.enc);
1354}
1355
/* Indices of the built-in encodings.  The values from ISO_8859_1_ENC up to
   (but excluding) NO_ENC are positions in getEncodingIndex's encodingNames
   array. */
1356/* If this enumeration is changed, getEncodingIndex and encodings
1357must also be changed. */
1358enum {
1359 UNKNOWN_ENC = -1, /* getEncodingIndex: name matched none of the known names */
1360 ISO_8859_1_ENC = 0,
1361 US_ASCII_ENC,
1362 UTF_8_ENC,
1363 UTF_16_ENC,
1364 UTF_16BE_ENC,
1365 UTF_16LE_ENC,
1366 /* must match encodingNames up to here */
1367 NO_ENC /* getEncodingIndex: no name was supplied (null pointer) */
1368};
1369
/* Names of the built-in encodings, compared case-insensitively by
   getEncodingIndex.  They are spelled with explicit ASCII_* code-point
   constants rather than string literals (the hex value fused to each name
   below is the analyzer's rendering of the constant's expansion). */
1370static const char KW_ISO_8859_1[] = {
1371 ASCII_I0x49, ASCII_S0x53, ASCII_O0x4F, ASCII_MINUS0x2D, ASCII_80x38, ASCII_80x38, ASCII_50x35, ASCII_90x39, ASCII_MINUS0x2D, ASCII_10x31, '\0'
1372};
1373static const char KW_US_ASCII[] = {
1374 ASCII_U0x55, ASCII_S0x53, ASCII_MINUS0x2D, ASCII_A0x41, ASCII_S0x53, ASCII_C0x43, ASCII_I0x49, ASCII_I0x49, '\0'
1375};
1376static const char KW_UTF_8[] = {
1377 ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_80x38, '\0'
1378};
1379static const char KW_UTF_16[] = {
1380 ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_10x31, ASCII_60x36, '\0'
1381};
1382static const char KW_UTF_16BE[] = {
1383 ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_10x31, ASCII_60x36, ASCII_B0x42, ASCII_E0x45, '\0'
1384};
1385static const char KW_UTF_16LE[] = {
1386 ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_10x31, ASCII_60x36, ASCII_L0x4C, ASCII_E0x45, '\0'
1387};
1388
1389static
1390int getEncodingIndex(const char *name)
1391{
1392 static const char *encodingNames[] = {
1393 KW_ISO_8859_1,
1394 KW_US_ASCII,
1395 KW_UTF_8,
1396 KW_UTF_16,
1397 KW_UTF_16BE,
1398 KW_UTF_16LE,
1399 };
1400 int i;
1401 if (name == 0)
1402 return NO_ENC;
1403 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1404 if (streqci(name, encodingNames[i]))
1405 return i;
1406 return UNKNOWN_ENC;
1407}
1408
1409/* For binary compatibility, we store the index of the encoding specified
1410at initialization in the isUtf16 member. */
1411
1412#define INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) ((int)(enc)->initEnc.isUtf16)
1413#define SET_INIT_ENC_INDEX(enc, i)((enc)->initEnc.isUtf16 = (char)i) ((enc)->initEnc.isUtf16 = (char)i)
1414
1415/* This is what detects the encoding.
1416encodingTable maps from encoding indices to encodings;
1417INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1418state is XML_CONTENT_STATE if we're parsing an external text entity,
1419and XML_PROLOG_STATE otherwise.
1420*/
1421
1422
/* Choose an encoding from the first bytes of the input, store it through
   enc->encPtr, and scan the first token with the scanner of that encoding.
   encodingTable: encodings indexed by the *_ENC enumerators.
   enc:           initial encoding; INIT_ENC_INDEX(enc) is the externally
                  specified encoding index and enc->encPtr is the slot that
                  receives the chosen encoding.
   state:         XML_CONTENT_STATE or XML_PROLOG_STATE; also used as the
                  index into the scanners array of the chosen encoding.
   ptr, end:      bounds of the available input.
   nextTokPtr:    set to the position after a byte order mark when one is
                  consumed here; otherwise handed to the scanner.
   Returns XML_TOK_NONE when the input is empty, XML_TOK_PARTIAL when more
   bytes are needed before a decision can be made, XML_TOK_BOM when a byte
   order mark was recognized, and otherwise whatever the selected scanner
   returns. */
1423static
1424int initScan(const ENCODING **encodingTable,
1425 const INIT_ENCODING *enc,
1426 int state,
1427 const char *ptr,
1428 const char *end,
1429 const char **nextTokPtr)
1430{
1431 const ENCODING **encPtr;
1432
1433 if (ptr == end)
1434 return XML_TOK_NONE-4;
1435 encPtr = enc->encPtr;
1436 if (ptr + 1 == end) {
1437 /* only a single byte available for auto-detection */
1438#ifndef XML_DTD /* FIXME */
1439 /* a well-formed document entity must have more than one byte */
1440 if (state != XML_CONTENT_STATE1)
1441 return XML_TOK_PARTIAL-1;
1442#endif
1443 /* so we're parsing an external text entity... */
1444 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1445 switch (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16)) {
1446 case UTF_16_ENC:
1447 case UTF_16LE_ENC:
1448 case UTF_16BE_ENC:
1449 return XML_TOK_PARTIAL-1;
1450 }
 /* A lone byte that could begin a byte order mark, or that is 00 or 3C,
    is not enough to decide, so ask for more input.  The exception is
    externally specified ISO-8859-1 in content state, where FE, FF and EF
    are left to the fallback at the end of the function. */
1451 switch ((unsigned char)*ptr) {
1452 case 0xFE:
1453 case 0xFF:
1454 case 0xEF: /* possibly first byte of UTF-8 BOM */
1455 if (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == ISO_8859_1_ENC
1456 && state == XML_CONTENT_STATE1)
1457 break;
1458 /* fall through */
1459 case 0x00:
1460 case 0x3C:
1461 return XML_TOK_PARTIAL-1;
1462 }
1463 }
1464 else {
 /* At least two bytes are available: dispatch on the first two bytes
    combined into one 16-bit value, first byte in the high half. */
1465 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
 /* FE FF: byte order mark selecting UTF-16BE; the two bytes are consumed. */
1466 case 0xFEFF:
1467 if (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == ISO_8859_1_ENC
1468 && state == XML_CONTENT_STATE1)
1469 break;
1470 *nextTokPtr = ptr + 2;
1471 *encPtr = encodingTable[UTF_16BE_ENC];
1472 return XML_TOK_BOM14;
1473 /* 00 3C is handled in the default case */
 /* 3C 00: scanned as UTF-16LE, unless UTF-16BE or UTF-16 was specified
    externally and this is content state. */
1474 case 0x3C00:
1475 if ((INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == UTF_16BE_ENC
1476 || INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == UTF_16_ENC)
1477 && state == XML_CONTENT_STATE1)
1478 break;
1479 *encPtr = encodingTable[UTF_16LE_ENC];
1480 return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr
))
;
 /* FF FE: byte order mark selecting UTF-16LE; the two bytes are consumed. */
1481 case 0xFFFE:
1482 if (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == ISO_8859_1_ENC
1483 && state == XML_CONTENT_STATE1)
1484 break;
1485 *nextTokPtr = ptr + 2;
1486 *encPtr = encodingTable[UTF_16LE_ENC];
1487 return XML_TOK_BOM14;
1488 case 0xEFBB:
1489 /* Maybe a UTF-8 BOM (EF BB BF) */
1490 /* If there's an explicitly specified (external) encoding
1491 of ISO-8859-1 or some flavour of UTF-16
1492 and this is an external text entity,
1493 don't look for the BOM,
1494 because it might be legal data. */
1495 if (state == XML_CONTENT_STATE1) {
1496 int e = INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16);
1497 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1498 break;
1499 }
 /* The third byte is needed to confirm the mark. */
1500 if (ptr + 2 == end)
1501 return XML_TOK_PARTIAL-1;
1502 if ((unsigned char)ptr[2] == 0xBF) {
1503 *nextTokPtr = ptr + 3;
1504 *encPtr = encodingTable[UTF_8_ENC];
1505 return XML_TOK_BOM14;
1506 }
1507 break;
1508 default:
1509 if (ptr[0] == '\0') {
1510 /* 0 isn't a legal data character. Furthermore a document entity can only
1511 start with ASCII characters. So the only way this can fail to be big-endian
1512 UTF-16 is if it's an external parsed general entity that's labelled as
1513 UTF-16LE. */
1514 if (state == XML_CONTENT_STATE1 && INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == UTF_16LE_ENC)
1515 break;
1516 *encPtr = encodingTable[UTF_16BE_ENC];
1517 return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr
))
;
1518 }
1519 else if (ptr[1] == '\0') {
1520 /* We could recover here in the case:
1521 - parsing an external entity
1522 - second byte is 0
1523 - no externally specified encoding
1524 - no encoding declaration
1525 by assuming UTF-16LE. But we don't, because this would mean when
1526 presented just with a single byte, we couldn't reliably determine
1527 whether we needed further bytes. */
1528 if (state == XML_CONTENT_STATE1)
1529 break;
1530 *encPtr = encodingTable[UTF_16LE_ENC];
1531 return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr
))
;
1532 }
1533 break;
1534 }
1535 }
 /* No signature was acted on: use the externally specified encoding index
    and let the scanner of that encoding produce the first token. */
1536 *encPtr = encodingTable[INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16)];
1537 return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr
))
;
1538}
1539
1540
/* First inclusion of xmltok_ns.c: NS(x) and ns(x) both expand to x
   unchanged, so names wrapped in these macros keep their plain spelling.
   Both macros are undefined again right after the inclusion.
   NOTE(review): the contents of xmltok_ns.c are not visible here;
   presumably it defines the encoding tables and init functions -- confirm. */
1541#define NS(x) x
1542#define ns(x) x
1543#include "xmltok_ns.c"
1544#undef NS
1545#undef ns
1546
1547#ifdef XML_NS
1548
/* Second inclusion of xmltok_ns.c, compiled only inside the surrounding
   XML_NS conditional: NS(x) pastes the suffix NS onto x and ns(x) pastes
   the suffix _ns, so the same source yields a second set of names.
   Both macros are undefined again after the inclusion. */
1549#define NS(x) x ## NS
1550#define ns(x) x ## _ns
1551
1552#include "xmltok_ns.c"
1553
1554#undef NS
1555#undef ns
1556
/* NS variant of XmlInitUnknownEncoding, compiled only inside the
   surrounding XML_NS conditional.
   mem, table, convert, userData: passed through unchanged to
   XmlInitUnknownEncoding.
   When that call returns a non-null encoding, the entry for the colon
   character (ASCII_COLON) in the type table of the encoding, viewed as
   struct normal_encoding, is set to BT_COLON.
   Returns the pointer obtained from XmlInitUnknownEncoding, which may be
   null. */
1557ENCODING *
1558XmlInitUnknownEncodingNS(void *mem,
1559 int *table,
1560 int (*convert)(void *userData, const char *p),
1561 void *userData)
1562{
1563 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
 /* Only touch the type table when initialization succeeded. */
1564 if (enc)
1565 ((struct normal_encoding *)enc)->type[ASCII_COLON0x3A] = BT_COLON;
1566 return enc;
1567}
1568
1569#endif /* XML_NS */