Bug Summary

File:src/mod/xml_int/mod_xml_rpc/../../../../libs/xmlrpc-c/lib/libutil/utf8.c
Location:line 377, column 10
Description:Access to field 'fault_occurred' results in a dereference of a null pointer (loaded from variable 'envP')

Annotated Source Code

1/* Copyright (C) 2001 by Eric Kidd. All rights reserved.
2**
3** Redistribution and use in source and binary forms, with or without
4** modification, are permitted provided that the following conditions
5** are met:
6** 1. Redistributions of source code must retain the above copyright
7** notice, this list of conditions and the following disclaimer.
8** 2. Redistributions in binary form must reproduce the above copyright
9** notice, this list of conditions and the following disclaimer in the
10** documentation and/or other materials provided with the distribution.
11** 3. The name of the author may not be used to endorse or promote products
12** derived from this software without specific prior written permission.
13**
14** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17** ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24** SUCH DAMAGE. */
25
26
27/*=========================================================================
28** XML-RPC UTF-8 Utilities
29**=========================================================================
30** Routines for validating, encoding and decoding UTF-8 data. We try to
31** be very, very strict about invalid UTF-8 data.
32**
33** All of the code in this file assumes that your machine represents
34** wchar_t as a 16-bit (or wider) character containing UCS-2 data. If this
35** assumption is incorrect, you may need to replace this file.
36**
37** For lots of information on Unicode and UTF-8 decoding, see:
38** http://www.cl.cam.ac.uk/~mgk25/unicode.html
39*/
40
41#include <assert.h>
42#include "int.h"
43
44#include "xmlrpc_config.h"
45#include "bool.h"
46#include "xmlrpc-c/base.h"
47
48/*=========================================================================
49** Tables and Constants
50**=========================================================================
51** We use a variety of tables and constants to help decode and validate
52** UTF-8 data.
53*/
54
static const unsigned char utf8SeqLength[256] = {

    /* utf8SeqLength[B] is the number of bytes in a UTF-8 sequence that starts
       with byte B.  Except zero indicates an illegal initial byte.

       The table is only ever read, so it is 'const'; that keeps it in
       read-only storage and matches utf8_min_char_for_length.

       Fredrik Lundh's UTF-8 decoder Python 2.0 uses a similar table.  But
       since Python 2.0 has the icky CNRI license, I generated this table
       from scratch and wrote my own decoder.
    */

    /*        0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
    /* 0 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 1 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 2 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 3 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 4 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 5 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 6 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 7 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 8 */   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 9 */   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A */   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B */   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C */   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D */   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E */   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F */   4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
83
/* utf8_min_char_for_length[N] is the smallest character value that may
** legally be encoded as a UTF-8 sequence of N bytes.  A decoded value
** below this bound means the sequence is "overlong", i.e. it uses more
** bytes than the character needs.  Overlong sequences are a classic way
** of sneaking data past filters and security checks, so callers reject
** them.  The bounds come from the UTF-8-test.txt file by Markus Kuhn
** <mkuhn@acm.org>. */
static uint32_t const utf8_min_char_for_length[] = {
    0x00000000,  /* N = 0: not used (meaningless) */
    0x00000000,  /* N = 1: not used (special-cased) */
    0x00000080,  /* N = 2 */
    0x00000800,  /* N = 3 */
    0x00010000,  /* N = 4 */
    0x00200000,  /* N = 5 */
    0x04000000   /* N = 6 */
};
99
100/* This is the maximum legal 16-bit (UCS-2) character. Again, this
101** information is based on UTF-8-test.txt. */
102#define UCS2_MAX_LEGAL_CHARACTER(0xFFFD) (0xFFFD)
103
104/* First and last UTF-16 surrogate characters. These are *not* legal UCS-2
105** characters--they're used to code for UCS-4 characters when using
106** UTF-16. They should never appear in decoded UTF-8 data! Again, these
107** could hypothetically be used to bypass security measures on some machines.
108** Based on UTF-8-test.txt. */
109#define UTF16_FIRST_SURROGATE(0xD800) (0xD800)
110#define UTF16_LAST_SURROGATE(0xDFFF) (0xDFFF)
111
112/* Is the character 'c' a UTF-8 continuation character, i.e. are its top
   two bits 10 (bit pattern 10xxxxxx)? */
113#define IS_CONTINUATION(c)(((c) & 0xC0) == 0x80) (((c) & 0xC0) == 0x80)
114
115#define MAX_ENCODED_BYTES(3) (3)
116 /* Maximum number of bytes needed to encode in UTF-8 a character
117 in the Basic Multilingual Plane.
118 */
119
120
121#if HAVE_UNICODE_WCHAR1
122
123
124static void
125validateContinuation(xmlrpc_env * const envP,
126 char const c) {
127
128 if (!IS_CONTINUATION(c)(((c) & 0xC0) == 0x80))
129 xmlrpc_env_set_fault_formatted(
130 envP, XMLRPC_INVALID_UTF8_ERROR(-510),
131 "UTF-8 multibyte sequence contains character 0x%02x, "
132 "which does not indicate continuation.", c);
133}
134
135
136
137static void
138validateUtf16(xmlrpc_env * const envP,
139 wchar_t const wc) {
140
141 if (wc > UCS2_MAX_LEGAL_CHARACTER(0xFFFD))
142 xmlrpc_env_set_fault_formatted(
143 envP, XMLRPC_INVALID_UTF8_ERROR(-510),
144 "UCS-2 characters > U+FFFD are illegal. String contains 0x%04x",
145 (unsigned)wc);
146 else if (UTF16_FIRST_SURROGATE(0xD800) <= wc && wc <= UTF16_LAST_SURROGATE(0xDFFF))
147 xmlrpc_env_set_fault_formatted(
148 envP, XMLRPC_INVALID_UTF8_ERROR(-510),
149 "UTF-16 surrogates may not appear in UTF-8 data. "
150 "String contains %04x", (unsigned)wc);
151}
152
153
154
155/* Microsoft Visual C in debug mode produces code that complains about
156 returning an undefined value from decodeMultibyte(). It's a bogus
157 complaint, because this function is defined to return nothing meaningful
158 in those cases. So we disable the check.
159*/
160#pragma runtime_checks("u", off)
161
162static void
163decodeMultibyte(xmlrpc_env * const envP,
164 const char * const utf8_seq,
165 size_t const length,
166 wchar_t * const wcP) {
167/*----------------------------------------------------------------------------
168 Decode the multibyte UTF-8 sequence which is 'length' characters
169 at 'utf8_data'.
170
171 Return the character in UTF-16 format as *wcP.
172-----------------------------------------------------------------------------*/
173 wchar_t wc;
174
175 assert(utf8_seq[0] & 0x80)((utf8_seq[0] & 0x80) ? (void) (0) : __assert_fail ("utf8_seq[0] & 0x80"
, "../../../../libs/xmlrpc-c/lib/libutil/utf8.c", 175, __PRETTY_FUNCTION__
))
; /* High bit set: this is multibyte seq */
176
177 switch (length) {
178 case 2:
179 /* 110xxxxx 10xxxxxx */
180 validateContinuation(envP, utf8_seq[1]);
181
182 if (!envP->fault_occurred)
183 wc = ((((wchar_t) (utf8_seq[0] & 0x1F)) << 6) |
184 (((wchar_t) (utf8_seq[1] & 0x3F))));
185 break;
186
187 case 3:
188 /* 1110xxxx 10xxxxxx 10xxxxxx */
189 validateContinuation(envP, utf8_seq[1]);
190 if (!envP->fault_occurred) {
191 validateContinuation(envP, utf8_seq[2]);
192 if (!envP->fault_occurred)
193 wc = ((((wchar_t) (utf8_seq[0] & 0x0F)) << 12) |
194 (((wchar_t) (utf8_seq[1] & 0x3F)) << 6) |
195 (((wchar_t) (utf8_seq[2] & 0x3F))));
196 }
197 break;
198
199 case 4:
200 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
201 case 5:
202 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
203 case 6:
204 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
205 /* This would require more than 16 bits in UTF-16, so
206 it can't be represented in UCS-2, so it's beyond
207 our capability. Characters in the BMP fit in 16
208 bits.
209 */
210 xmlrpc_env_set_fault_formatted(
211 envP, XMLRPC_INVALID_UTF8_ERROR(-510),
212 "UTF-8 string contains a character not in the "
213 "Basic Multilingual Plane (first byte 0x%02x)",
214 utf8_seq[0]);
215 break;
216
217 default:
218 xmlrpc_faultf(envP,
219 "Internal error: Impossible UTF-8 sequence length %u",
220 (unsigned)length);
221 }
222
223 if (!envP->fault_occurred)
224 validateUtf16(envP, wc);
225
226 if (!envP->fault_occurred)
227 if ((uint32_t)wc < utf8_min_char_for_length[length])
228 xmlrpc_env_set_fault_formatted(
229 envP, XMLRPC_INVALID_UTF8_ERROR(-510),
230 "Overlong UTF-8 sequence not allowed");
231
232 *wcP = wc;
233}
234
235#pragma runtime_checks("u", restore)
236
237
238
239static void
240decodeUtf8(xmlrpc_env * const envP,
241 const char * const utf8_data,
242 size_t const utf8_len,
243 wchar_t * const ioBuff,
244 size_t * const outBuffLenP) {
245/*----------------------------------------------------------------------------
246 Decode to UCS-2 (or validate as UTF-8 that can be decoded to UCS-2)
247 a UTF-8 string. To validate, set ioBuff and outBuffLenP to NULL.
248 To decode, allocate a sufficiently large buffer, pass it as ioBuff,
249 and pass a pointer as as outBuffLenP. The data will be written to
250 the buffer, and the length to outBuffLenP.
251
252 We assume that wchar_t holds a single UCS-2 character in native-endian
253 byte ordering.
254-----------------------------------------------------------------------------*/
255 size_t utf8Cursor;
256 size_t outPos;
257
258 XMLRPC_ASSERT_ENV_OK(envP)do if (!((envP) != ((void*)0) && (envP->fault_string
== ((void*)0)) && !(envP)->fault_occurred)) xmlrpc_assertion_failed
("../../../../libs/xmlrpc-c/lib/libutil/utf8.c", 258); while (
0)
;
259 XMLRPC_ASSERT_PTR_OK(utf8_data)do if (!((utf8_data) != ((void*)0))) xmlrpc_assertion_failed(
"../../../../libs/xmlrpc-c/lib/libutil/utf8.c", 259); while (
0)
;
260 XMLRPC_ASSERT((!ioBuff && !outBuffLenP) || (ioBuff && outBuffLenP))do if (!((!ioBuff && !outBuffLenP) || (ioBuff &&
outBuffLenP))) xmlrpc_assertion_failed("../../../../libs/xmlrpc-c/lib/libutil/utf8.c"
, 260); while (0)
;
261
262 for (utf8Cursor = 0, outPos = 0;
263 utf8Cursor < utf8_len && !envP->fault_occurred;
264 ) {
265
266 char const init = utf8_data[utf8Cursor];
267 /* Initial byte of the UTF-8 sequence */
268
269 wchar_t wc;
270
271 if ((init & 0x80) == 0x00) {
272 /* Convert ASCII character to wide character. */
273 wc = init;
274 ++utf8Cursor;
275 } else {
276 /* Look up the length of this UTF-8 sequence. */
277 size_t const length = utf8SeqLength[(unsigned char) init];
278
279 if (length == 0)
280 xmlrpc_env_set_fault_formatted(
281 envP, XMLRPC_INVALID_UTF8_ERROR(-510),
282 "Unrecognized UTF-8 initial byte value 0x%02x", init);
283 else {
284 /* Make sure we have enough bytes to convert. */
285 if (utf8Cursor + length > utf8_len) {
286 xmlrpc_env_set_fault_formatted(
287 envP, XMLRPC_INVALID_UTF8_ERROR(-510),
288 "Invalid UTF-8 sequence indicates a %u-byte sequence "
289 "when only %u bytes are left in the string",
290 (unsigned)length, (unsigned)(utf8_len - utf8Cursor));
291 } else {
292 decodeMultibyte(envP, &utf8_data[utf8Cursor], length, &wc);
293
294 /* Advance to the end of the sequence. */
295 utf8Cursor += length;
296 }
297 }
298 }
299
300 if (!envP->fault_occurred) {
301 /* If we have a buffer, write our character to it. */
302 if (ioBuff)
303 ioBuff[outPos++] = wc;
304 }
305 }
306
307 if (outBuffLenP)
308 *outBuffLenP = envP->fault_occurred ? 0 : outPos;
309}
310
311
312
313xmlrpc_mem_block *
314xmlrpc_utf8_to_wcs(xmlrpc_env * const envP,
315 const char * const utf8_data,
316 size_t const utf8_len) {
317/*----------------------------------------------------------------------------
318 Decode UTF-8 string to a "wide character string". This function
319 returns an xmlrpc_mem_block with an element type of wchar_t. Don't
320 try to intepret the block in a bytewise fashion--it won't work in
321 any useful or portable fashion.
322
323 For backward compatibility, we return a meaningful value even when we
324 fail. We return NULL when we fail.
325-----------------------------------------------------------------------------*/
326 xmlrpc_mem_block * wcsP;
327 size_t wcs_length;
328
329 /* Allocate a memory block large enough to hold any possible output.
330 We assume that each byte of the input may decode to a whcar_t.
331 */
332 wcsP = XMLRPC_MEMBLOCK_NEW(wchar_t, envP, utf8_len)xmlrpc_mem_block_new((envP), sizeof(wchar_t) * (utf8_len));
333 if (!envP->fault_occurred) {
334 /* Decode the UTF-8 data. */
335 decodeUtf8(envP, utf8_data, utf8_len,
336 XMLRPC_MEMBLOCK_CONTENTS(wchar_t, wcsP)((wchar_t*) xmlrpc_mem_block_contents(wcsP)),
337 &wcs_length);
338 if (!envP->fault_occurred) {
339 /* We can't have overrun our buffer. */
340 XMLRPC_ASSERT(wcs_length <= utf8_len)do if (!(wcs_length <= utf8_len)) xmlrpc_assertion_failed(
"../../../../libs/xmlrpc-c/lib/libutil/utf8.c", 340); while (
0)
;
341
342 /* Correct the length of the memory block. */
343 XMLRPC_MEMBLOCK_RESIZE(wchar_t, envP, wcsP, wcs_length)xmlrpc_mem_block_resize(envP, wcsP, sizeof(wchar_t) * (wcs_length
))
;
344 }
345 if (envP->fault_occurred)
346 XMLRPC_MEMBLOCK_FREE(wchar_t, wcsP)xmlrpc_mem_block_free(wcsP);
347 }
348 if (envP->fault_occurred)
349 return NULL((void*)0);
350 else
351 return wcsP;
352}
353
354
355
356xmlrpc_mem_block *
357xmlrpc_wcs_to_utf8(xmlrpc_env * const envP,
358 const wchar_t * const wcs_data,
359 size_t const wcs_len) {
360/*----------------------------------------------------------------------------
361 Encode a "wide character string" as UTF-8.
362
363 For backward compatibility, we return a meaningful value even when we
364 fail. We return NULL when we fail.
365-----------------------------------------------------------------------------*/
366 size_t const estimate = wcs_len * MAX_ENCODED_BYTES(3);
367 /* Our conservative estimate of how big the output will be;
368 i.e. we know it won't be larger than this. For the estimate,
369 we assume that every wchar might encode to the maximum length.
370 */
371 xmlrpc_mem_block * utf8P;
372
373 XMLRPC_ASSERT_ENV_OK(envP)do if (!((envP) != ((void*)0) && (envP->fault_string
== ((void*)0)) && !(envP)->fault_occurred)) xmlrpc_assertion_failed
("../../../../libs/xmlrpc-c/lib/libutil/utf8.c", 373); while (
0)
;
1
Within the expansion of the macro 'XMLRPC_ASSERT_ENV_OK':
a
Assuming 'envP' is equal to null
374 XMLRPC_ASSERT_PTR_OK(wcs_data)do if (!((wcs_data) != ((void*)0))) xmlrpc_assertion_failed("../../../../libs/xmlrpc-c/lib/libutil/utf8.c"
, 374); while (0)
;
375
376 utf8P = XMLRPC_MEMBLOCK_NEW(char, envP, estimate)xmlrpc_mem_block_new((envP), sizeof(char) * (estimate));
377 if (!envP->fault_occurred) {
2
Access to field 'fault_occurred' results in a dereference of a null pointer (loaded from variable 'envP')
378 unsigned char * const buffer =
379 XMLRPC_MEMBLOCK_CONTENTS(unsigned char, utf8P)((unsigned char*) xmlrpc_mem_block_contents(utf8P));
380 size_t bytesUsed;
381 size_t i;
382
383 bytesUsed = 0;
384 for (i = 0; i < wcs_len && !envP->fault_occurred; ++i) {
385 wchar_t const wc = wcs_data[i];
386 if (wc <= 0x007F)
387 buffer[bytesUsed++] = wc & 0x7F;
388 else if (wc <= 0x07FF) {
389 /* 110xxxxx 10xxxxxx */
390 buffer[bytesUsed++] = 0xC0 | (wc >> 6);
391 buffer[bytesUsed++] = 0x80 | (wc & 0x3F);
392 } else if (wc <= 0xFFFF) {
393 /* 1110xxxx 10xxxxxx 10xxxxxx */
394 buffer[bytesUsed++] = 0xE0 | (wc >> 12);
395 buffer[bytesUsed++] = 0x80 | ((wc >> 6) & 0x3F);
396 buffer[bytesUsed++] = 0x80 | (wc & 0x3F);
397 } else
398 xmlrpc_faultf(envP,
399 "Don't know how to encode UCS-4 characters yet");
400 }
401 if (!envP->fault_occurred) {
402 XMLRPC_ASSERT(bytesUsed <= estimate)do if (!(bytesUsed <= estimate)) xmlrpc_assertion_failed("../../../../libs/xmlrpc-c/lib/libutil/utf8.c"
, 402); while (0)
;
403
404 XMLRPC_MEMBLOCK_RESIZE(char, envP, utf8P, bytesUsed)xmlrpc_mem_block_resize(envP, utf8P, sizeof(char) * (bytesUsed
))
;
405 }
406 if (envP->fault_occurred)
407 XMLRPC_MEMBLOCK_FREE(char, utf8P)xmlrpc_mem_block_free(utf8P);
408 }
409
410 if (envP->fault_occurred)
411 return NULL((void*)0);
412 else
413 return utf8P;
414}
415
416
417
418#else /* HAVE_UNICODE_WCHAR */
419
420xmlrpc_mem_block *
421xmlrpc_utf8_to_wcs(xmlrpc_env * const envP,
422 const char * const utf8_data ATTR_UNUSED__attribute__((__unused__)),
423 size_t const utf8_len ATTR_UNUSED__attribute__((__unused__))) {
424
425 xmlrpc_faultf(envP, "INTERNAL ERROR: xmlrpc_utf8_to_wcs() called "
426 "on a system that doesn't do Unicode!");
427
428 return NULL((void*)0);
429}
430#endif /* HAVE_UNICODE_WCHAR */
431
432
433void
434xmlrpc_force_to_utf8(char * const buffer) {
435/*----------------------------------------------------------------------------
436 Force the contents of 'buffer' to be valid UTF-8, any way possible.
437 The buffer ends with a NUL character, and the mutation does not make
438 it longer.
439
440 The most common reason for a string that's supposed to be UTF-8 not
441 to be UTF-8 is that it was supposed to be ASCII but instead
442 includes garbage with the high bit on (ASCII characters always have
443 the high bit off), or maybe a primitive 8-bit ASCII extension.
444 Therefore, we force it to UTF-8 by replacing some bytes that have
445 the high bit set with DEL (0x7F). That would leave the other
446 characters meaningful.
447-----------------------------------------------------------------------------*/
448 char * p;
449
450 for (p = &buffer[0]; *p;) {
451 unsigned int const length = utf8SeqLength[(unsigned char) *p];
452
453 bool forceDel;
454 uint32_t decoded;
455
456 forceDel = false; /* initial value */
457
458 switch (length) {
459 case 1:
460 /* One-byte UTF-8 characters are easy. */
461 decoded = *p;
462 break;
463 case 2:
464 /* 110xxxxx 10xxxxxx */
465 if (!*(p+1) || !(*p+2))
466 forceDel = true;
467 else if (!IS_CONTINUATION(*(p+1))(((*(p+1)) & 0xC0) == 0x80))
468 forceDel = true;
469 else
470 decoded =
471 ((uint32_t)(*(p+0) & 0x1F) << 6) |
472 ((uint32_t)(*(p+1) & 0x3F) << 0);
473 break;
474 case 3:
475 /* 1110xxxx 10xxxxxx 10xxxxxx */
476 if (!*(p+1) || !(*p+2) || !(*p+3))
477 forceDel = true;
478 else if (!IS_CONTINUATION(*(p+1))(((*(p+1)) & 0xC0) == 0x80) || !IS_CONTINUATION(*(p+2))(((*(p+2)) & 0xC0) == 0x80))
479 forceDel = true;
480 else
481 decoded =
482 ((uint32_t)(*(p+0) & 0x0F) << 12) |
483 ((uint32_t)(*(p+1) & 0x3F) << 6) |
484 ((uint32_t)(*(p+2) & 0x3F) << 0);
485 break;
486 default:
487 forceDel = true;
488 }
489
490 if (!forceDel) {
491 if (decoded > UCS2_MAX_LEGAL_CHARACTER(0xFFFD))
492 forceDel = true;
493 else if (UTF16_FIRST_SURROGATE(0xD800) <= decoded &&
494 decoded <= UTF16_LAST_SURROGATE(0xDFFF))
495 forceDel = true;
496 else if (decoded < utf8_min_char_for_length[length])
497 forceDel = true;
498 }
499
500 if (forceDel) {
501 /* Not a valid UTF-8 character, so replace the first byte
502 with a nice simple ASCII DEL.
503 */
504 *p = 0x7F;
505 p += 1;
506 } else
507 p += length;
508 }
509}
510
511
512
513void
514xmlrpc_force_to_xml_chars(char * const buffer) {
515/*----------------------------------------------------------------------------
516 Modify 'buffer' so that it contains nothing but valid XML
517 characters. The buffer ends with a NUL character, and the mutation
518 does not make it longer.
519
520 Note that the valid characters in an XML document are all Unicode
521 codepoints except the ASCII control characters, plus CR, LF, and
522 Tab.
523
524 We change all non-XML characters to DEL (0x7F).
525
526 Assume input is valid UTF-8.
527-----------------------------------------------------------------------------*/
528 char * p;
529
530 for (p = &buffer[0]; *p;) {
531 unsigned int const length = utf8SeqLength[(unsigned char) *p];
532
533 if (length == 1) {
534 if (*p < 0x20 && *p != '\r' && *p != '\n' && *p != '\t')
535 /* Not valid XML. Force to DEL */
536 *p = 0x7f;
537 } else {
538 /* We assume here that all other UTF-8 characters are
539 valid XML, but it's apparently not actually true.
540 */
541 }
542
543 {
544 unsigned int i;
545 /* Advance to next UTF-8 character */
546 for (i = 0; i < length && *p; ++i)
547 ++p;
548 }
549 }
550}
551
552
553
554void
555xmlrpc_validate_utf8(xmlrpc_env * const envP,
556 const char * const utf8_data,
557 size_t const utf8_len) {
558/*----------------------------------------------------------------------------
559 Validate that a string is valid UTF-8.
560-----------------------------------------------------------------------------*/
561 xmlrpc_env env;
562
563 xmlrpc_env_init(&env);
564
565#if HAVE_UNICODE_WCHAR1
566 decodeUtf8(&env, utf8_data, utf8_len, NULL((void*)0), NULL((void*)0));
567#else
568 /* We don't have a convenient way to validate, so we just fake it and
569 call it valid.
570 */
571#endif
572
573 if (env.fault_occurred) {
574 xmlrpc_env_set_fault_formatted(
575 envP, XMLRPC_INVALID_UTF8_ERROR(-510),
576 "%" XMLRPC_PRId64"lld" "-byte "
577 "supposed UTF-8 string is not valid UTF-8. %s",
578 (XMLRPC_INT64long long)utf8_len, env.fault_string);
579 }
580 xmlrpc_env_clean(&env);
581}