../../../../libs/xmlrpc-c/lib/libutil/utf8.c

Bug Summary

File:	src/mod/xml_int/mod_xml_rpc/../../../../libs/xmlrpc-c/lib/libutil/utf8.c
Location:	line 232, column 10
Description:	Assigned value is garbage or undefined

Annotated Source Code

** Redistribution and use in source and binary forms, with or without

** modification, are permitted provided that the following conditions

** are met:

** 1. Redistributions of source code must retain the above copyright

** notice, this list of conditions and the following disclaimer.

** 2. Redistributions in binary form must reproduce the above copyright

** notice, this list of conditions and the following disclaimer in the

** documentation and/or other materials provided with the distribution.

** 3. The name of the author may not be used to endorse or promote products

** derived from this software without specific prior written permission.

** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND

** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

** ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE

** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL

** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS

** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)

** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY

** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

** SUCH DAMAGE. */

/*=========================================================================
**  XML-RPC UTF-8 Utilities
**=========================================================================
**  Routines for validating, encoding and decoding UTF-8 data.  We try to
**  be very, very strict about invalid UTF-8 data.
**
**  All of the code in this file assumes that your machine represents
**  wchar_t as a 16-bit (or wider) character containing UCS-2 data.  If this
**  assumption is incorrect, you may need to replace this file.
**
**  For lots of information on Unicode and UTF-8 decoding, see:
**    http://www.cl.cam.ac.uk/~mgk25/unicode.html
*/

#include <assert.h>

#include "int.h"

#include "xmlrpc_config.h"

#include "bool.h"

#include "xmlrpc-c/base.h"

/*=========================================================================
**  Tables and Constants
**=========================================================================
**  We use a variety of tables and constants to help decode and validate
**  UTF-8 data.
*/

static unsigned char const utf8SeqLength[256] = {
    /* utf8SeqLength[B] is the number of bytes in a UTF-8 sequence that
       starts with byte B.  Except zero indicates an illegal initial byte.

       Fredrik Lundh's UTF-8 decoder Python 2.0 uses a similar table.  But
       since Python 2.0 has the icky CNRI license, I generated this table
       from scratch and wrote my own decoder.

       The table is only ever read, so it is declared const.
    */

    /*        0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
    /* 0 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 1 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 2 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 3 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 4 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 5 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 6 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 7 */   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 8 */   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 9 */   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A */   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B */   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C */   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D */   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E */   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F */   4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

/* The minimum legal character value for a UTF-8 sequence of the given

** length. We have to check this to avoid accepting "overlong" UTF-8

** sequences, which use more bytes than necessary to encode a given

** character. Such sequences are commonly used by evil people to bypass

** filters and security checks. This table is based on the UTF-8-test.txt

** file by Markus Kuhn <mkuhn@acm.org>. */

static uint32_t const utf8_min_char_for_length[] = {
    /* Indexed by sequence length in bytes; the entry is the smallest
       code point that legitimately needs that many bytes.
    */
    0x00000000u,  /* 0 bytes: not used (meaningless) */
    0x00000000u,  /* 1 byte : not used (special-cased) */
    0x00000080u,  /* 2 bytes */
    0x00000800u,  /* 3 bytes */
    0x00010000u,  /* 4 bytes */
    0x00200000u,  /* 5 bytes */
    0x04000000u   /* 6 bytes */
};

100

/* This is the maximum legal 16-bit (UCS-2) character.  Again, this
** information is based on UTF-8-test.txt. */
#define UCS2_MAX_LEGAL_CHARACTER (0xFFFD)

/* First and last UTF-16 surrogate characters.  These are *not* legal UCS-2
** characters--they're used to code for UCS-4 characters when using
** UTF-16.  They should never appear in decoded UTF-8 data!  Again, these
** could hypothetically be used to bypass security measures on some machines.
** Based on UTF-8-test.txt. */
#define UTF16_FIRST_SURROGATE (0xD800)
#define UTF16_LAST_SURROGATE  (0xDFFF)

/* Is the character 'c' a UTF-8 continuation character (bit pattern
** 10xxxxxx)?  'c' is evaluated exactly once. */
#define IS_CONTINUATION(c) (((c) & 0xC0) == 0x80)

#define MAX_ENCODED_BYTES (3)
    /* Maximum number of bytes needed to encode in UTF-8 a character
       in the Basic Multilingual Plane.
    */

118

119

120

121

#if HAVE_UNICODE_WCHAR1

122

123

124

static void

125

validateContinuation(xmlrpc_env * const envP,

126

char const c) {

127

128

if (!IS_CONTINUATION(c)(((c) & 0xC0) == 0x80))

129

xmlrpc_env_set_fault_formatted(

130

envP, XMLRPC_INVALID_UTF8_ERROR(-510),

131

"UTF-8 multibyte sequence contains character 0x%02x, "

132

"which does not indicate continuation.", c);

133

}

134

135

136

137

static void

138

validateUtf16(xmlrpc_env * const envP,

139

wchar_t const wc) {

140

141

if (wc > UCS2_MAX_LEGAL_CHARACTER(0xFFFD))

142

xmlrpc_env_set_fault_formatted(

143

envP, XMLRPC_INVALID_UTF8_ERROR(-510),

144

"UCS-2 characters > U+FFFD are illegal. String contains 0x%04x",

145

(unsigned)wc);

146

else if (UTF16_FIRST_SURROGATE(0xD800) <= wc && wc <= UTF16_LAST_SURROGATE(0xDFFF))

147

xmlrpc_env_set_fault_formatted(

148

envP, XMLRPC_INVALID_UTF8_ERROR(-510),

149

"UTF-16 surrogates may not appear in UTF-8 data. "

150

"String contains %04x", (unsigned)wc);

151

}

152

153

154

155

/* Microsoft Visual C in debug mode produces code that complains about
   returning an undefined value from xmlrpc_datetime_new_str().  It's a bogus
   complaint, because this function is defined to return nothing meaningful
   in those cases.  So we disable the check.
*/

159

160

#pragma runtime_checks("u", off)

161

162

static void

163

decodeMultibyte(xmlrpc_env * const envP,

164

const char * const utf8_seq,

165

size_t const length,

166

wchar_t * const wcP) {

167

/*----------------------------------------------------------------------------

168

Decode the multibyte UTF-8 sequence which is 'length' characters

169

at 'utf8_data'.

170

171

Return the character in UTF-16 format as *wcP.

172

-----------------------------------------------------------------------------*/

173

wchar_t wc;

←

'wc' declared without an initial value

→

174

175

assert(utf8_seq[0] & 0x80)((utf8_seq[0] & 0x80) ? (void) (0) : __assert_fail ("utf8_seq[0] & 0x80"
, "../../../../libs/xmlrpc-c/lib/libutil/utf8.c", 175, __PRETTY_FUNCTION__
)); /* High bit set: this is multibyte seq */

176

177

switch (length) {

←

Control jumps to the 'default' case at line 217

→

178

case 2:

179

/* 110xxxxx 10xxxxxx */

180

validateContinuation(envP, utf8_seq[1]);

181

182

if (!envP->fault_occurred)

183

wc = ((((wchar_t) (utf8_seq[0] & 0x1F)) << 6) |

184

(((wchar_t) (utf8_seq[1] & 0x3F))));

185

break;

186

187

case 3:

188

/* 1110xxxx 10xxxxxx 10xxxxxx */

189

validateContinuation(envP, utf8_seq[1]);

190

if (!envP->fault_occurred) {

191

validateContinuation(envP, utf8_seq[2]);

192

if (!envP->fault_occurred)

193

wc = ((((wchar_t) (utf8_seq[0] & 0x0F)) << 12) |

194

(((wchar_t) (utf8_seq[1] & 0x3F)) << 6) |

195

(((wchar_t) (utf8_seq[2] & 0x3F))));

196

}

197

break;

198

199

case 4:

200

/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */

201

case 5:

202

/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */

203

case 6:

204

/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */

205

/* This would require more than 16 bits in UTF-16, so

206

it can't be represented in UCS-2, so it's beyond

207

our capability. Characters in the BMP fit in 16

208

bits.

209

210

xmlrpc_env_set_fault_formatted(

211

envP, XMLRPC_INVALID_UTF8_ERROR(-510),

212

"UTF-8 string contains a character not in the "

213

"Basic Multilingual Plane (first byte 0x%02x)",

214

utf8_seq[0]);

215

break;

216

217

default:

218

xmlrpc_faultf(envP,

219

"Internal error: Impossible UTF-8 sequence length %u",

220

(unsigned)length);

221

}

222

223

if (!envP->fault_occurred)

←

Taking false branch

→

224

validateUtf16(envP, wc);

225

226

if (!envP->fault_occurred)

←

Taking false branch

→

227

if ((uint32_t)wc < utf8_min_char_for_length[length])

228

xmlrpc_env_set_fault_formatted(

229

envP, XMLRPC_INVALID_UTF8_ERROR(-510),

230

"Overlong UTF-8 sequence not allowed");

231

232

*wcP = wc;

←

Assigned value is garbage or undefined

233

}

234

235

#pragma runtime_checks("u", restore)

236

237

238

239

static void

240

decodeUtf8(xmlrpc_env * const envP,

241

const char * const utf8_data,

242

size_t const utf8_len,

243

wchar_t * const ioBuff,

244

size_t * const outBuffLenP) {

245

/*----------------------------------------------------------------------------

246

Decode to UCS-2 (or validate as UTF-8 that can be decoded to UCS-2)

247

a UTF-8 string. To validate, set ioBuff and outBuffLenP to NULL.

248

To decode, allocate a sufficiently large buffer, pass it as ioBuff,

249

and pass a pointer as as outBuffLenP. The data will be written to

250

the buffer, and the length to outBuffLenP.

251

252

We assume that wchar_t holds a single UCS-2 character in native-endian

253

byte ordering.

254

-----------------------------------------------------------------------------*/

255

size_t utf8Cursor;

256

size_t outPos;

257

258

XMLRPC_ASSERT_ENV_OK(envP)do if (!((envP) != ((void*)0) && (envP->fault_string
== ((void*)0)) && !(envP)->fault_occurred)) xmlrpc_assertion_failed
("../../../../libs/xmlrpc-c/lib/libutil/utf8.c", 258); while (
0);

259

XMLRPC_ASSERT_PTR_OK(utf8_data)do if (!((utf8_data) != ((void*)0))) xmlrpc_assertion_failed(
"../../../../libs/xmlrpc-c/lib/libutil/utf8.c", 259); while (
0);

260

XMLRPC_ASSERT((!ioBuff && !outBuffLenP) || (ioBuff && outBuffLenP))do if (!((!ioBuff && !outBuffLenP) || (ioBuff &&
outBuffLenP))) xmlrpc_assertion_failed("../../../../libs/xmlrpc-c/lib/libutil/utf8.c"
, 260); while (0);

261

262

for (utf8Cursor = 0, outPos = 0;

←

Loop condition is true. Entering loop body

→

263

utf8Cursor < utf8_len && !envP->fault_occurred;

←

Assuming 'utf8Cursor' is < 'utf8_len'

→

264

) {

265

266

char const init = utf8_data[utf8Cursor];

267

/* Initial byte of the UTF-8 sequence */

268

269

wchar_t wc;

270

271

if ((init & 0x80) == 0x00) {

←

Taking false branch

→

272

/* Convert ASCII character to wide character. */

273

wc = init;

274

++utf8Cursor;

275

} else {

276

/* Look up the length of this UTF-8 sequence. */

277

size_t const length = utf8SeqLength[(unsigned char) init];

278

279

if (length == 0)

←

Assuming 'length' is not equal to 0

→

←

Taking false branch

→

280

xmlrpc_env_set_fault_formatted(

281

envP, XMLRPC_INVALID_UTF8_ERROR(-510),

282

"Unrecognized UTF-8 initial byte value 0x%02x", init);

283

else {

284

/* Make sure we have enough bytes to convert. */

285

if (utf8Cursor + length > utf8_len) {

←

Taking false branch

→

286

xmlrpc_env_set_fault_formatted(

287

envP, XMLRPC_INVALID_UTF8_ERROR(-510),

288

"Invalid UTF-8 sequence indicates a %u-byte sequence "

289

"when only %u bytes are left in the string",

290

(unsigned)length, (unsigned)(utf8_len - utf8Cursor));

291

} else {

292

decodeMultibyte(envP, &utf8_data[utf8Cursor], length, &wc);

←

Calling 'decodeMultibyte'

→

293

294

/* Advance to the end of the sequence. */

295

utf8Cursor += length;

296

}

297

}

298

}

299

300

if (!envP->fault_occurred) {

301

/* If we have a buffer, write our character to it. */

302

if (ioBuff)

303

ioBuff[outPos++] = wc;

304

}

305

}

306

307

if (outBuffLenP)

308

*outBuffLenP = envP->fault_occurred ? 0 : outPos;

309

}

310

311

312

313

xmlrpc_mem_block *

314

xmlrpc_utf8_to_wcs(xmlrpc_env * const envP,

315

const char * const utf8_data,

316

size_t const utf8_len) {

317

/*----------------------------------------------------------------------------

318

Decode UTF-8 string to a "wide character string". This function

319

returns an xmlrpc_mem_block with an element type of wchar_t. Don't

320

try to intepret the block in a bytewise fashion--it won't work in

321

any useful or portable fashion.

322

323

For backward compatibility, we return a meaningful value even when we

324

fail. We return NULL when we fail.

325

-----------------------------------------------------------------------------*/

326

xmlrpc_mem_block * wcsP;

327

size_t wcs_length;

328

329

/* Allocate a memory block large enough to hold any possible output.

330

We assume that each byte of the input may decode to a whcar_t.

331

332

wcsP = XMLRPC_MEMBLOCK_NEW(wchar_t, envP, utf8_len)xmlrpc_mem_block_new((envP), sizeof(wchar_t) * (utf8_len));

333

if (!envP->fault_occurred) {

334

/* Decode the UTF-8 data. */

335

decodeUtf8(envP, utf8_data, utf8_len,

336

XMLRPC_MEMBLOCK_CONTENTS(wchar_t, wcsP)((wchar_t*) xmlrpc_mem_block_contents(wcsP)),

337

&wcs_length);

338

if (!envP->fault_occurred) {

339

/* We can't have overrun our buffer. */

340

XMLRPC_ASSERT(wcs_length <= utf8_len)do if (!(wcs_length <= utf8_len)) xmlrpc_assertion_failed(
"../../../../libs/xmlrpc-c/lib/libutil/utf8.c", 340); while (
0);

341

342

/* Correct the length of the memory block. */

343

XMLRPC_MEMBLOCK_RESIZE(wchar_t, envP, wcsP, wcs_length)xmlrpc_mem_block_resize(envP, wcsP, sizeof(wchar_t) * (wcs_length
));

344

}

345

if (envP->fault_occurred)

346

XMLRPC_MEMBLOCK_FREE(wchar_t, wcsP)xmlrpc_mem_block_free(wcsP);

347

}

348

if (envP->fault_occurred)

349

return NULL((void*)0);

350

else

351

return wcsP;

352

}

353

354

355

356

xmlrpc_mem_block *

357

xmlrpc_wcs_to_utf8(xmlrpc_env * const envP,

358

const wchar_t * const wcs_data,

359

size_t const wcs_len) {

360

/*----------------------------------------------------------------------------

361

Encode a "wide character string" as UTF-8.

362

363

For backward compatibility, we return a meaningful value even when we

364

fail. We return NULL when we fail.

365

-----------------------------------------------------------------------------*/

366

size_t const estimate = wcs_len * MAX_ENCODED_BYTES(3);

367

/* Our conservative estimate of how big the output will be;

368

i.e. we know it won't be larger than this. For the estimate,

369

we assume that every wchar might encode to the maximum length.

370

371

xmlrpc_mem_block * utf8P;

372

373

374

XMLRPC_ASSERT_PTR_OK(wcs_data)do if (!((wcs_data) != ((void*)0))) xmlrpc_assertion_failed("../../../../libs/xmlrpc-c/lib/libutil/utf8.c"
, 374); while (0);

375

376

utf8P = XMLRPC_MEMBLOCK_NEW(char, envP, estimate)xmlrpc_mem_block_new((envP), sizeof(char) * (estimate));

377

if (!envP->fault_occurred) {

378

unsigned char * const buffer =

379

XMLRPC_MEMBLOCK_CONTENTS(unsigned char, utf8P)((unsigned char*) xmlrpc_mem_block_contents(utf8P));

380

size_t bytesUsed;

381

size_t i;

382

383

bytesUsed = 0;

384

for (i = 0; i < wcs_len && !envP->fault_occurred; ++i) {

385

wchar_t const wc = wcs_data[i];

386

if (wc <= 0x007F)

387

buffer[bytesUsed++] = wc & 0x7F;

388

else if (wc <= 0x07FF) {

389

/* 110xxxxx 10xxxxxx */

390

buffer[bytesUsed++] = 0xC0 | (wc >> 6);

391

buffer[bytesUsed++] = 0x80 | (wc & 0x3F);

392

} else if (wc <= 0xFFFF) {

393

/* 1110xxxx 10xxxxxx 10xxxxxx */

394

buffer[bytesUsed++] = 0xE0 | (wc >> 12);

395

buffer[bytesUsed++] = 0x80 | ((wc >> 6) & 0x3F);

396

buffer[bytesUsed++] = 0x80 | (wc & 0x3F);

397

} else

398

xmlrpc_faultf(envP,

399

"Don't know how to encode UCS-4 characters yet");

400

}

401

if (!envP->fault_occurred) {

402

XMLRPC_ASSERT(bytesUsed <= estimate)do if (!(bytesUsed <= estimate)) xmlrpc_assertion_failed("../../../../libs/xmlrpc-c/lib/libutil/utf8.c"
, 402); while (0);

403

404

XMLRPC_MEMBLOCK_RESIZE(char, envP, utf8P, bytesUsed)xmlrpc_mem_block_resize(envP, utf8P, sizeof(char) * (bytesUsed
));

405

}

406

if (envP->fault_occurred)

407

XMLRPC_MEMBLOCK_FREE(char, utf8P)xmlrpc_mem_block_free(utf8P);

408

}

409

410

if (envP->fault_occurred)

411

return NULL((void*)0);

412

else

413

return utf8P;

414

}

415

416

417

418

#else /* HAVE_UNICODE_WCHAR */

419

420

xmlrpc_mem_block *

421

xmlrpc_utf8_to_wcs(xmlrpc_env * const envP,

422

const char * const utf8_data ATTR_UNUSED__attribute__((__unused__)),

423

size_t const utf8_len ATTR_UNUSED__attribute__((__unused__))) {

424

425

xmlrpc_faultf(envP, "INTERNAL ERROR: xmlrpc_utf8_to_wcs() called "

426

"on a system that doesn't do Unicode!");

427

428

return NULL((void*)0);

429

}

430

#endif /* HAVE_UNICODE_WCHAR */

431

432

433

void

434

xmlrpc_force_to_utf8(char * const buffer) {

435

/*----------------------------------------------------------------------------

436

Force the contents of 'buffer' to be valid UTF-8, any way possible.

437

The buffer ends with a NUL character, and the mutation does not make

438

it longer.

439

440

The most common reason for a string that's supposed to be UTF-8 not

441

to be UTF-8 is that it was supposed to be ASCII but instead

442

includes garbage with the high bit on (ASCII characters always have

443

the high bit off), or maybe a primitive 8-bit ASCII extension.

444

Therefore, we force it to UTF-8 by replacing some bytes that have

445

the high bit set with DEL (0x7F). That would leave the other

446

characters meaningful.

447

-----------------------------------------------------------------------------*/

448

char * p;

449

450

for (p = &buffer[0]; *p;) {

451

unsigned int const length = utf8SeqLength[(unsigned char) *p];

452

453

bool forceDel;

454

uint32_t decoded;

455

456

forceDel = false; /* initial value */

457

458

switch (length) {

459

case 1:

460

/* One-byte UTF-8 characters are easy. */

461

decoded = *p;

462

break;

463

case 2:

464

/* 110xxxxx 10xxxxxx */

465

if (!*(p+1) || !(*p+2))

466

forceDel = true;

467

else if (!IS_CONTINUATION(*(p+1))(((*(p+1)) & 0xC0) == 0x80))

468

forceDel = true;

469

else

470

decoded =

471

((uint32_t)(*(p+0) & 0x1F) << 6) |

472

((uint32_t)(*(p+1) & 0x3F) << 0);

473

break;

474

case 3:

475

/* 1110xxxx 10xxxxxx 10xxxxxx */

476

if (!*(p+1) || !(*p+2) || !(*p+3))

477

forceDel = true;

478

else if (!IS_CONTINUATION(*(p+1))(((*(p+1)) & 0xC0) == 0x80) || !IS_CONTINUATION(*(p+2))(((*(p+2)) & 0xC0) == 0x80))

479

forceDel = true;

480

else

481

decoded =

482

((uint32_t)(*(p+0) & 0x0F) << 12) |

483

((uint32_t)(*(p+1) & 0x3F) << 6) |

484

((uint32_t)(*(p+2) & 0x3F) << 0);

485

break;

486

default:

487

forceDel = true;

488

}

489

490

if (!forceDel) {

491

if (decoded > UCS2_MAX_LEGAL_CHARACTER(0xFFFD))

492

forceDel = true;

493

else if (UTF16_FIRST_SURROGATE(0xD800) <= decoded &&

494

decoded <= UTF16_LAST_SURROGATE(0xDFFF))

495

forceDel = true;

496

else if (decoded < utf8_min_char_for_length[length])

497

forceDel = true;

498

}

499

500

if (forceDel) {

501

/* Not a valid UTF-8 character, so replace the first byte

502

with a nice simple ASCII DEL.

503

504

*p = 0x7F;

505

p += 1;

506

} else

507

p += length;

508

}

509

}

510

511

512

513

void

514

xmlrpc_force_to_xml_chars(char * const buffer) {

515

/*----------------------------------------------------------------------------

516

Modify 'buffer' so that it contains nothing but valid XML

517

characters. The buffer ends with a NUL character, and the mutation

518

does not make it longer.

519

520

Note that the valid characters in an XML document are all Unicode

521

codepoints except the ASCII control characters, plus CR, LF, and

522

Tab.

523

524

We change all non-XML characters to DEL (0x7F).

525

526

Assume input is valid UTF-8.

527

-----------------------------------------------------------------------------*/

528

char * p;

529

530

for (p = &buffer[0]; *p;) {

531

unsigned int const length = utf8SeqLength[(unsigned char) *p];

532

533

if (length == 1) {

534

if (*p < 0x20 && *p != '\r' && *p != '\n' && *p != '\t')

535

/* Not valid XML. Force to DEL */

536

*p = 0x7f;

537

} else {

538

/* We assume here that all other UTF-8 characters are

539

valid XML, but it's apparently not actually true.

540

541

}

542

543

{

544

unsigned int i;

545

/* Advance to next UTF-8 character */

546

for (i = 0; i < length && *p; ++i)

547

++p;

548

}

549

}

550

}

551

552

553

554

void

555

xmlrpc_validate_utf8(xmlrpc_env * const envP,

556

const char * const utf8_data,

557

size_t const utf8_len) {

558

/*----------------------------------------------------------------------------

559

Validate that a string is valid UTF-8.

560

-----------------------------------------------------------------------------*/

561

xmlrpc_env env;

562

563

xmlrpc_env_init(&env);

564

565

#if HAVE_UNICODE_WCHAR1

566

decodeUtf8(&env, utf8_data, utf8_len, NULL((void*)0), NULL((void*)0));

Calling 'decodeUtf8'

→

567

#else

568

/* We don't have a convenient way to validate, so we just fake it and

569

call it valid.

570

571

#endif

572

573

if (env.fault_occurred) {

574

xmlrpc_env_set_fault_formatted(

575

envP, XMLRPC_INVALID_UTF8_ERROR(-510),

576

"%" XMLRPC_PRId64"lld" "-byte "

577

"supposed UTF-8 string is not valid UTF-8. %s",

578

(XMLRPC_INT64long long)utf8_len, env.fault_string);

579

}

580

xmlrpc_env_clean(&env);

581

}