BOSS 8.0.0
BESIII Offline Software System
Loading...
Searching...
No Matches
xmltok.c
Go to the documentation of this file.
1/*
2The contents of this file are subject to the Mozilla Public License
3Version 1.1 (the "License"); you may not use this file except in
4compliance with the License. You may obtain a copy of the License at
5http://www.mozilla.org/MPL/
6
7Software distributed under the License is distributed on an "AS IS"
8basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9License for the specific language governing rights and limitations
10under the License.
11
12The Original Code is expat.
13
14The Initial Developer of the Original Code is James Clark.
15Portions created by James Clark are Copyright (C) 1998, 1999
16James Clark. All Rights Reserved.
17
18Contributor(s):
19
20Alternatively, the contents of this file may be used under the terms
21of the GNU General Public License (the "GPL"), in which case the
22provisions of the GPL are applicable instead of those above. If you
23wish to allow use of your version of this file only under the terms of
24the GPL and not to allow others to use your version of this file under
25the MPL, indicate your decision by deleting the provisions above and
26replace them with the notice and other provisions required by the
27GPL. If you do not delete the provisions above, a recipient may use
28your version of this file under either the MPL or the GPL.
29*/
30
31#ifdef HAVE_CONFIG_H
32# include "config.h"
33#endif
34
35#include "nametab.h"
36#include "xmldef.h"
37#include "xmltok.h"
38
39#define VTABLE1 \
40 { PREFIX( prologTok ), PREFIX( contentTok ), PREFIX( cdataSectionTok ) }, \
41 { PREFIX( attributeValueTok ), PREFIX( entityValueTok ) }, PREFIX( sameName ), \
42 PREFIX( nameMatchesAscii ), PREFIX( nameLength ), PREFIX( skipS ), PREFIX( getAtts ), \
43 PREFIX( charRefNumber ), PREFIX( predefinedEntityName ), PREFIX( updatePosition ), \
44 PREFIX( isPublicId )
45
46#define VTABLE VTABLE1, PREFIX( toUtf8 ), PREFIX( toUtf16 )
47
48#define UCS2_GET_NAMING( pages, hi, lo ) \
49 ( namingBitmap[( pages[hi] << 3 ) + ( ( lo ) >> 5 )] & ( 1 << ( ( lo ) & 0x1F ) ) )
50
51/* A 2 byte UTF-8 representation splits the characters 11 bits
52between the bottom 5 and 6 bits of the bytes.
53We need 8 bits to index into pages, 3 bits to add to that index and
545 bits to generate the mask. */
55#define UTF8_GET_NAMING2( pages, byte ) \
56 ( namingBitmap[( ( pages )[( ( ( byte )[0] ) >> 2 ) & 7] << 3 ) + \
57 ( ( ( ( byte )[0] ) & 3 ) << 1 ) + ( ( ( ( byte )[1] ) >> 5 ) & 1 )] & \
58 ( 1 << ( ( ( byte )[1] ) & 0x1F ) ) )
59
60/* A 3 byte UTF-8 representation splits the characters 16 bits
61between the bottom 4, 6 and 6 bits of the bytes.
62We need 8 bits to index into pages, 3 bits to add to that index and
635 bits to generate the mask. */
64#define UTF8_GET_NAMING3( pages, byte ) \
65 ( namingBitmap[( ( pages )[( ( ( ( byte )[0] ) & 0xF ) << 4 ) + \
66 ( ( ( ( byte )[1] ) >> 2 ) & 0xF )] \
67 << 3 ) + \
68 ( ( ( ( byte )[1] ) & 3 ) << 1 ) + ( ( ( ( byte )[2] ) >> 5 ) & 1 )] & \
69 ( 1 << ( ( ( byte )[2] ) & 0x1F ) ) )
70
71#define UTF8_GET_NAMING( pages, p, n ) \
72 ( ( n ) == 2 \
73 ? UTF8_GET_NAMING2( pages, (const unsigned char*)( p ) ) \
74 : ( ( n ) == 3 ? UTF8_GET_NAMING3( pages, (const unsigned char*)( p ) ) : 0 ) )
75
/* Nonzero when the 3-byte UTF-8 sequence at p must be rejected:
   - lead byte 0xED whose second byte has bit 0x20 set (for a continuation
     byte that is 0xA0..0xBF, i.e. the surrogate range), or
   - the exact byte triples EF BF BE / EF BF BF (U+FFFE / U+FFFF).
   p must point at unsigned bytes.  The argument is parenthesized before
   being dereferenced so that pointer expressions such as `buf + 1` are
   handled correctly. */
#define UTF8_INVALID3( p )                                                                 \
  ( ( p )[0] == 0xED                                                                       \
        ? ( ( ( p )[1] & 0x20 ) != 0 )                                                     \
        : ( ( p )[0] == 0xEF                                                               \
                ? ( ( p )[1] == 0xBF && ( ( p )[2] == 0xBF || ( p )[2] == 0xBE ) )         \
                : 0 ) )

/* Nonzero when the 4-byte UTF-8 sequence at p has lead byte 0xF4 and a
   second byte with either of the bits 0x30 set (for a continuation byte
   that is 0x90..0xBF).  p must point at unsigned bytes. */
#define UTF8_INVALID4( p ) ( ( p )[0] == 0xF4 && ( ( p )[1] & 0x30 ) != 0 )
83
84static int isNever( const ENCODING* enc, const char* p ) { return 0; }
85
86static int utf8_isName2( const ENCODING* enc, const char* p ) {
87 return UTF8_GET_NAMING2( namePages, (const unsigned char*)p );
88}
89
90static int utf8_isName3( const ENCODING* enc, const char* p ) {
91 return UTF8_GET_NAMING3( namePages, (const unsigned char*)p );
92}
93
94#define utf8_isName4 isNever
95
96static int utf8_isNmstrt2( const ENCODING* enc, const char* p ) {
97 return UTF8_GET_NAMING2( nmstrtPages, (const unsigned char*)p );
98}
99
100static int utf8_isNmstrt3( const ENCODING* enc, const char* p ) {
101 return UTF8_GET_NAMING3( nmstrtPages, (const unsigned char*)p );
102}
103
104#define utf8_isNmstrt4 isNever
105
106#define utf8_isInvalid2 isNever
107
108static int utf8_isInvalid3( const ENCODING* enc, const char* p ) {
109 return UTF8_INVALID3( (const unsigned char*)p );
110}
111
112static int utf8_isInvalid4( const ENCODING* enc, const char* p ) {
113 return UTF8_INVALID4( (const unsigned char*)p );
114}
115
117 ENCODING enc;
118 unsigned char type[256];
119#ifdef XML_MIN_SIZE
120 int ( *byteType )( const ENCODING*, const char* );
121 int ( *isNameMin )( const ENCODING*, const char* );
122 int ( *isNmstrtMin )( const ENCODING*, const char* );
123 int ( *byteToAscii )( const ENCODING*, const char* );
124 int ( *charMatches )( const ENCODING*, const char*, int );
125#endif /* XML_MIN_SIZE */
126 int ( *isName2 )( const ENCODING*, const char* );
127 int ( *isName3 )( const ENCODING*, const char* );
128 int ( *isName4 )( const ENCODING*, const char* );
129 int ( *isNmstrt2 )( const ENCODING*, const char* );
130 int ( *isNmstrt3 )( const ENCODING*, const char* );
131 int ( *isNmstrt4 )( const ENCODING*, const char* );
132 int ( *isInvalid2 )( const ENCODING*, const char* );
133 int ( *isInvalid3 )( const ENCODING*, const char* );
134 int ( *isInvalid4 )( const ENCODING*, const char* );
135};
136
137#ifdef XML_MIN_SIZE
138
139# define STANDARD_VTABLE( E ) \
140 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
141
142#else
143
144# define STANDARD_VTABLE( E ) /* as nothing */
145
146#endif
147
148#define NORMAL_VTABLE( E ) \
149 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, E##isNmstrt4, \
150 E##isInvalid2, E##isInvalid3, E##isInvalid4
151
152static int checkCharRefNumber( int );
153
154#include "xmltok_impl.h"
155
156#ifdef XML_MIN_SIZE
157# define sb_isNameMin isNever
158# define sb_isNmstrtMin isNever
159#endif
160
161#ifdef XML_MIN_SIZE
162# define MINBPC( enc ) ( ( enc )->minBytesPerChar )
163#else
164/* minimum bytes per character */
165# define MINBPC( enc ) 1
166#endif
167
168#define SB_BYTE_TYPE( enc, p ) \
169 ( ( (struct normal_encoding*)( enc ) )->type[(unsigned char)*( p )] )
170
171#ifdef XML_MIN_SIZE
172static int sb_byteType( const ENCODING* enc, const char* p ) { return SB_BYTE_TYPE( enc, p ); }
173# define BYTE_TYPE( enc, p ) \
174 ( ( (const struct normal_encoding*)( enc ) )->byteType( enc, p ) )
175#else
176# define BYTE_TYPE( enc, p ) SB_BYTE_TYPE( enc, p )
177#endif
178
179#ifdef XML_MIN_SIZE
180# define BYTE_TO_ASCII( enc, p ) \
181 ( ( (const struct normal_encoding*)( enc ) )->byteToAscii( enc, p ) )
182static int sb_byteToAscii( const ENCODING* enc, const char* p ) { return *p; }
183#else
184# define BYTE_TO_ASCII( enc, p ) ( *p )
185#endif
186
187#define IS_NAME_CHAR( enc, p, n ) \
188 ( ( (const struct normal_encoding*)( enc ) )->isName##n( enc, p ) )
189#define IS_NMSTRT_CHAR( enc, p, n ) \
190 ( ( (const struct normal_encoding*)( enc ) )->isNmstrt##n( enc, p ) )
191#define IS_INVALID_CHAR( enc, p, n ) \
192 ( ( (const struct normal_encoding*)( enc ) )->isInvalid##n( enc, p ) )
193
194#ifdef XML_MIN_SIZE
195# define IS_NAME_CHAR_MINBPC( enc, p ) \
196 ( ( (const struct normal_encoding*)( enc ) )->isNameMin( enc, p ) )
197# define IS_NMSTRT_CHAR_MINBPC( enc, p ) \
198 ( ( (const struct normal_encoding*)( enc ) )->isNmstrtMin( enc, p ) )
199#else
200# define IS_NAME_CHAR_MINBPC( enc, p ) ( 0 )
201# define IS_NMSTRT_CHAR_MINBPC( enc, p ) ( 0 )
202#endif
203
204#ifdef XML_MIN_SIZE
205# define CHAR_MATCHES( enc, p, c ) \
206 ( ( (const struct normal_encoding*)( enc ) )->charMatches( enc, p, c ) )
207static int sb_charMatches( const ENCODING* enc, const char* p, int c ) { return *p == c; }
208#else
209/* c is an ASCII character */
210# define CHAR_MATCHES( enc, p, c ) ( *( p ) == c )
211#endif
212
213#define PREFIX( ident ) normal_##ident
214#include "xmltok_impl.c"
215
216#undef MINBPC
217#undef BYTE_TYPE
218#undef BYTE_TO_ASCII
219#undef CHAR_MATCHES
220#undef IS_NAME_CHAR
221#undef IS_NAME_CHAR_MINBPC
222#undef IS_NMSTRT_CHAR
223#undef IS_NMSTRT_CHAR_MINBPC
224#undef IS_INVALID_CHAR
225
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xc0, /* 110xxxxx */
  UTF8_cval3 = 0xe0, /* 1110xxxx */
  UTF8_cval4 = 0xf0  /* 11110xxx */
};
232
233static void utf8_toUtf8( const ENCODING* enc, const char** fromP, const char* fromLim,
234 char** toP, const char* toLim ) {
235 char* to;
236 const char* from;
237 if ( fromLim - *fromP > toLim - *toP )
238 {
239 /* Avoid copying partial characters. */
240 for ( fromLim = *fromP + ( toLim - *toP ); fromLim > *fromP; fromLim-- )
241 if ( ( (unsigned char)fromLim[-1] & 0xc0 ) != 0x80 ) break;
242 }
243 for ( to = *toP, from = *fromP; from != fromLim; from++, to++ ) *to = *from;
244 *fromP = from;
245 *toP = to;
246}
247
248static void utf8_toUtf16( const ENCODING* enc, const char** fromP, const char* fromLim,
249 unsigned short** toP, const unsigned short* toLim ) {
250 unsigned short* to = *toP;
251 const char* from = *fromP;
252 while ( from != fromLim && to != toLim )
253 {
254 switch ( ( (struct normal_encoding*)enc )->type[(unsigned char)*from] )
255 {
256 case BT_LEAD2:
257 *to++ = ( ( from[0] & 0x1f ) << 6 ) | ( from[1] & 0x3f );
258 from += 2;
259 break;
260 case BT_LEAD3:
261 *to++ = ( ( from[0] & 0xf ) << 12 ) | ( ( from[1] & 0x3f ) << 6 ) | ( from[2] & 0x3f );
262 from += 3;
263 break;
264 case BT_LEAD4: {
265 unsigned long n;
266 if ( to + 1 == toLim ) break;
267 n = ( ( from[0] & 0x7 ) << 18 ) | ( ( from[1] & 0x3f ) << 12 ) |
268 ( ( from[2] & 0x3f ) << 6 ) | ( from[3] & 0x3f );
269 n -= 0x10000;
270 to[0] = (unsigned short)( ( n >> 10 ) | 0xD800 );
271 to[1] = (unsigned short)( ( n & 0x3FF ) | 0xDC00 );
272 to += 2;
273 from += 4;
274 }
275 break;
276 default: *to++ = *from++; break;
277 }
278 }
279 *fromP = from;
280 *toP = to;
281}
282
283#ifdef XML_NS
284static const struct normal_encoding utf8_encoding_ns = {
285 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
286 {
287# include "asciitab.h"
288# include "utf8tab.h"
289 },
290 STANDARD_VTABLE( sb_ ) NORMAL_VTABLE( utf8_ ) };
291#endif
292
293static const struct normal_encoding utf8_encoding = {
294 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
295 {
296#define BT_COLON BT_NMSTRT
297#include "asciitab.h"
298#undef BT_COLON
299#include "utf8tab.h"
300 },
301 STANDARD_VTABLE( sb_ ) NORMAL_VTABLE( utf8_ ) };
302
303#ifdef XML_NS
304
305static const struct normal_encoding internal_utf8_encoding_ns = {
306 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
307 {
308# include "iasciitab.h"
309# include "utf8tab.h"
310 },
311 STANDARD_VTABLE( sb_ ) NORMAL_VTABLE( utf8_ ) };
312
313#endif
314
315static const struct normal_encoding internal_utf8_encoding = {
316 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
317 {
318#define BT_COLON BT_NMSTRT
319#include "iasciitab.h"
320#undef BT_COLON
321#include "utf8tab.h"
322 },
323 STANDARD_VTABLE( sb_ ) NORMAL_VTABLE( utf8_ ) };
324
325static void latin1_toUtf8( const ENCODING* enc, const char** fromP, const char* fromLim,
326 char** toP, const char* toLim ) {
327 for ( ;; )
328 {
329 unsigned char c;
330 if ( *fromP == fromLim ) break;
331 c = (unsigned char)**fromP;
332 if ( c & 0x80 )
333 {
334 if ( toLim - *toP < 2 ) break;
335 *( *toP )++ = ( ( c >> 6 ) | UTF8_cval2 );
336 *( *toP )++ = ( ( c & 0x3f ) | 0x80 );
337 ( *fromP )++;
338 }
339 else
340 {
341 if ( *toP == toLim ) break;
342 *( *toP )++ = *( *fromP )++;
343 }
344 }
345}
346
347static void latin1_toUtf16( const ENCODING* enc, const char** fromP, const char* fromLim,
348 unsigned short** toP, const unsigned short* toLim ) {
349 while ( *fromP != fromLim && *toP != toLim ) *( *toP )++ = (unsigned char)*( *fromP )++;
350}
351
352#ifdef XML_NS
353
354static const struct normal_encoding latin1_encoding_ns = {
355 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
356 {
357# include "asciitab.h"
358# include "latin1tab.h"
359 },
360 STANDARD_VTABLE( sb_ ) };
361
362#endif
363
364static const struct normal_encoding latin1_encoding = {
365 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
366 {
367#define BT_COLON BT_NMSTRT
368#include "asciitab.h"
369#undef BT_COLON
370#include "latin1tab.h"
371 },
372 STANDARD_VTABLE( sb_ ) };
373
374static void ascii_toUtf8( const ENCODING* enc, const char** fromP, const char* fromLim,
375 char** toP, const char* toLim ) {
376 while ( *fromP != fromLim && *toP != toLim ) *( *toP )++ = *( *fromP )++;
377}
378
379#ifdef XML_NS
380
381static const struct normal_encoding ascii_encoding_ns = {
382 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
383 {
384# include "asciitab.h"
385 /* BT_NONXML == 0 */
386 },
387 STANDARD_VTABLE( sb_ ) };
388
389#endif
390
391static const struct normal_encoding ascii_encoding = {
392 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
393 {
394#define BT_COLON BT_NMSTRT
395#include "asciitab.h"
396#undef BT_COLON
397 /* BT_NONXML == 0 */
398 },
399 STANDARD_VTABLE( sb_ ) };
400
401static int unicode_byte_type( char hi, char lo ) {
402 switch ( (unsigned char)hi )
403 {
404 case 0xD8:
405 case 0xD9:
406 case 0xDA:
407 case 0xDB: return BT_LEAD4;
408 case 0xDC:
409 case 0xDD:
410 case 0xDE:
411 case 0xDF: return BT_TRAIL;
412 case 0xFF:
413 switch ( (unsigned char)lo )
414 {
415 case 0xFF:
416 case 0xFE: return BT_NONXML;
417 }
418 break;
419 }
420 return BT_NONASCII;
421}
422
423#define DEFINE_UTF16_TO_UTF8( E ) \
424 static void E##toUtf8( const ENCODING* enc, const char** fromP, const char* fromLim, \
425 char** toP, const char* toLim ) { \
426 const char* from; \
427 for ( from = *fromP; from != fromLim; from += 2 ) \
428 { \
429 int plane; \
430 unsigned char lo2; \
431 unsigned char lo = GET_LO( from ); \
432 unsigned char hi = GET_HI( from ); \
433 switch ( hi ) \
434 { \
435 case 0: \
436 if ( lo < 0x80 ) \
437 { \
438 if ( *toP == toLim ) \
439 { \
440 *fromP = from; \
441 return; \
442 } \
443 *( *toP )++ = lo; \
444 break; \
445 } \
446 /* fall through */ \
447 case 0x1: \
448 case 0x2: \
449 case 0x3: \
450 case 0x4: \
451 case 0x5: \
452 case 0x6: \
453 case 0x7: \
454 if ( toLim - *toP < 2 ) \
455 { \
456 *fromP = from; \
457 return; \
458 } \
459 *( *toP )++ = ( ( lo >> 6 ) | ( hi << 2 ) | UTF8_cval2 ); \
460 *( *toP )++ = ( ( lo & 0x3f ) | 0x80 ); \
461 break; \
462 default: \
463 if ( toLim - *toP < 3 ) \
464 { \
465 *fromP = from; \
466 return; \
467 } \
468 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
469 *( *toP )++ = ( ( hi >> 4 ) | UTF8_cval3 ); \
470 *( *toP )++ = ( ( ( hi & 0xf ) << 2 ) | ( lo >> 6 ) | 0x80 ); \
471 *( *toP )++ = ( ( lo & 0x3f ) | 0x80 ); \
472 break; \
473 case 0xD8: \
474 case 0xD9: \
475 case 0xDA: \
476 case 0xDB: \
477 if ( toLim - *toP < 4 ) \
478 { \
479 *fromP = from; \
480 return; \
481 } \
482 plane = ( ( ( hi & 0x3 ) << 2 ) | ( ( lo >> 6 ) & 0x3 ) ) + 1; \
483 *( *toP )++ = ( ( plane >> 2 ) | UTF8_cval4 ); \
484 *( *toP )++ = ( ( ( lo >> 2 ) & 0xF ) | ( ( plane & 0x3 ) << 4 ) | 0x80 ); \
485 from += 2; \
486 lo2 = GET_LO( from ); \
487 *( *toP )++ = ( ( ( lo & 0x3 ) << 4 ) | ( ( GET_HI( from ) & 0x3 ) << 2 ) | \
488 ( lo2 >> 6 ) | 0x80 ); \
489 *( *toP )++ = ( ( lo2 & 0x3f ) | 0x80 ); \
490 break; \
491 } \
492 } \
493 *fromP = from; \
494 }
495
496#define DEFINE_UTF16_TO_UTF16( E ) \
497 static void E##toUtf16( const ENCODING* enc, const char** fromP, const char* fromLim, \
498 unsigned short** toP, const unsigned short* toLim ) { \
499 /* Avoid copying first half only of surrogate */ \
500 if ( fromLim - *fromP > ( ( toLim - *toP ) << 1 ) && \
501 ( GET_HI( fromLim - 2 ) & 0xF8 ) == 0xD8 ) \
502 fromLim -= 2; \
503 for ( ; *fromP != fromLim && *toP != toLim; *fromP += 2 ) \
504 *( *toP )++ = ( GET_HI( *fromP ) << 8 ) | GET_LO( *fromP ); \
505 }
506
507#define SET2( ptr, ch ) \
508 ( ( ( ptr )[0] = ( ( ch ) & 0xff ) ), ( ( ptr )[1] = ( ( ch ) >> 8 ) ) )
509#define GET_LO( ptr ) ( (unsigned char)( ptr )[0] )
510#define GET_HI( ptr ) ( (unsigned char)( ptr )[1] )
511
512DEFINE_UTF16_TO_UTF8( little2_ )
513DEFINE_UTF16_TO_UTF16( little2_ )
514
515#undef SET2
516#undef GET_LO
517#undef GET_HI
518
519#define SET2( ptr, ch ) \
520 ( ( ( ptr )[0] = ( ( ch ) >> 8 ) ), ( ( ptr )[1] = ( ( ch ) & 0xFF ) ) )
521#define GET_LO( ptr ) ( (unsigned char)( ptr )[1] )
522#define GET_HI( ptr ) ( (unsigned char)( ptr )[0] )
523
526
527#undef SET2
528#undef GET_LO
529#undef GET_HI
530
531#define LITTLE2_BYTE_TYPE( enc, p ) \
532 ( ( p )[1] == 0 ? ( (struct normal_encoding*)( enc ) )->type[(unsigned char)*( p )] \
533 : unicode_byte_type( ( p )[1], ( p )[0] ) )
534#define LITTLE2_BYTE_TO_ASCII( enc, p ) ( ( p )[1] == 0 ? ( p )[0] : -1 )
535#define LITTLE2_CHAR_MATCHES( enc, p, c ) ( ( p )[1] == 0 && ( p )[0] == c )
536#define LITTLE2_IS_NAME_CHAR_MINBPC( enc, p ) \
537 UCS2_GET_NAMING( namePages, (unsigned char)p[1], (unsigned char)p[0] )
538#define LITTLE2_IS_NMSTRT_CHAR_MINBPC( enc, p ) \
539 UCS2_GET_NAMING( nmstrtPages, (unsigned char)p[1], (unsigned char)p[0] )
540
541#ifdef XML_MIN_SIZE
542
543static int little2_byteType( const ENCODING* enc, const char* p ) {
544 return LITTLE2_BYTE_TYPE( enc, p );
545}
546
547static int little2_byteToAscii( const ENCODING* enc, const char* p ) {
548 return LITTLE2_BYTE_TO_ASCII( enc, p );
549}
550
551static int little2_charMatches( const ENCODING* enc, const char* p, int c ) {
552 return LITTLE2_CHAR_MATCHES( enc, p, c );
553}
554
555static int little2_isNameMin( const ENCODING* enc, const char* p ) {
556 return LITTLE2_IS_NAME_CHAR_MINBPC( enc, p );
557}
558
559static int little2_isNmstrtMin( const ENCODING* enc, const char* p ) {
561}
562
563# undef VTABLE
564# define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
565
566#else /* not XML_MIN_SIZE */
567
568# undef PREFIX
569# define PREFIX( ident ) little2_##ident
570# define MINBPC( enc ) 2
571/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
572# define BYTE_TYPE( enc, p ) LITTLE2_BYTE_TYPE( enc, p )
573# define BYTE_TO_ASCII( enc, p ) LITTLE2_BYTE_TO_ASCII( enc, p )
574# define CHAR_MATCHES( enc, p, c ) LITTLE2_CHAR_MATCHES( enc, p, c )
575# define IS_NAME_CHAR( enc, p, n ) 0
576# define IS_NAME_CHAR_MINBPC( enc, p ) LITTLE2_IS_NAME_CHAR_MINBPC( enc, p )
577# define IS_NMSTRT_CHAR( enc, p, n ) ( 0 )
578# define IS_NMSTRT_CHAR_MINBPC( enc, p ) LITTLE2_IS_NMSTRT_CHAR_MINBPC( enc, p )
579
580# include "xmltok_impl.c"
581
582# undef MINBPC
583# undef BYTE_TYPE
584# undef BYTE_TO_ASCII
585# undef CHAR_MATCHES
586# undef IS_NAME_CHAR
587# undef IS_NAME_CHAR_MINBPC
588# undef IS_NMSTRT_CHAR
589# undef IS_NMSTRT_CHAR_MINBPC
590# undef IS_INVALID_CHAR
591
592#endif /* not XML_MIN_SIZE */
593
594#ifdef XML_NS
595
596static const struct normal_encoding little2_encoding_ns = { { VTABLE, 2, 0,
597# if XML_BYTE_ORDER == 12
598 1
599# else
600 0
601# endif
602 },
603 {
604# include "asciitab.h"
605# include "latin1tab.h"
606 },
607 STANDARD_VTABLE( little2_ ) };
608
609#endif
610
611static const struct normal_encoding little2_encoding = { { VTABLE, 2, 0,
612#if XML_BYTE_ORDER == 12
613 1
614#else
615 0
616#endif
617 },
618 {
619#define BT_COLON BT_NMSTRT
620#include "asciitab.h"
621#undef BT_COLON
622#include "latin1tab.h"
623 },
624 STANDARD_VTABLE( little2_ ) };
625
626#if XML_BYTE_ORDER != 21
627
628# ifdef XML_NS
629
630static const struct normal_encoding internal_little2_encoding_ns = {
631 { VTABLE, 2, 0, 1 },
632 {
633# include "iasciitab.h"
634# include "latin1tab.h"
635 },
636 STANDARD_VTABLE( little2_ ) };
637
638# endif
639
640static const struct normal_encoding internal_little2_encoding = {
641 { VTABLE, 2, 0, 1 },
642 {
643# define BT_COLON BT_NMSTRT
644# include "iasciitab.h"
645# undef BT_COLON
646# include "latin1tab.h"
647 },
648 STANDARD_VTABLE( little2_ ) };
649
650#endif
651
652#define BIG2_BYTE_TYPE( enc, p ) \
653 ( ( p )[0] == 0 ? ( (struct normal_encoding*)( enc ) )->type[(unsigned char)( p )[1]] \
654 : unicode_byte_type( ( p )[0], ( p )[1] ) )
655#define BIG2_BYTE_TO_ASCII( enc, p ) ( ( p )[0] == 0 ? ( p )[1] : -1 )
656#define BIG2_CHAR_MATCHES( enc, p, c ) ( ( p )[0] == 0 && ( p )[1] == c )
657#define BIG2_IS_NAME_CHAR_MINBPC( enc, p ) \
658 UCS2_GET_NAMING( namePages, (unsigned char)p[0], (unsigned char)p[1] )
659#define BIG2_IS_NMSTRT_CHAR_MINBPC( enc, p ) \
660 UCS2_GET_NAMING( nmstrtPages, (unsigned char)p[0], (unsigned char)p[1] )
661
662#ifdef XML_MIN_SIZE
663
664static int big2_byteType( const ENCODING* enc, const char* p ) {
665 return BIG2_BYTE_TYPE( enc, p );
666}
667
668static int big2_byteToAscii( const ENCODING* enc, const char* p ) {
669 return BIG2_BYTE_TO_ASCII( enc, p );
670}
671
672static int big2_charMatches( const ENCODING* enc, const char* p, int c ) {
673 return BIG2_CHAR_MATCHES( enc, p, c );
674}
675
676static int big2_isNameMin( const ENCODING* enc, const char* p ) {
677 return BIG2_IS_NAME_CHAR_MINBPC( enc, p );
678}
679
680static int big2_isNmstrtMin( const ENCODING* enc, const char* p ) {
681 return BIG2_IS_NMSTRT_CHAR_MINBPC( enc, p );
682}
683
684# undef VTABLE
685# define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
686
687#else /* not XML_MIN_SIZE */
688
689# undef PREFIX
690# define PREFIX( ident ) big2_##ident
691# define MINBPC( enc ) 2
692/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
693# define BYTE_TYPE( enc, p ) BIG2_BYTE_TYPE( enc, p )
694# define BYTE_TO_ASCII( enc, p ) BIG2_BYTE_TO_ASCII( enc, p )
695# define CHAR_MATCHES( enc, p, c ) BIG2_CHAR_MATCHES( enc, p, c )
696# define IS_NAME_CHAR( enc, p, n ) 0
697# define IS_NAME_CHAR_MINBPC( enc, p ) BIG2_IS_NAME_CHAR_MINBPC( enc, p )
698# define IS_NMSTRT_CHAR( enc, p, n ) ( 0 )
699# define IS_NMSTRT_CHAR_MINBPC( enc, p ) BIG2_IS_NMSTRT_CHAR_MINBPC( enc, p )
700
701# include "xmltok_impl.c"
702
703# undef MINBPC
704# undef BYTE_TYPE
705# undef BYTE_TO_ASCII
706# undef CHAR_MATCHES
707# undef IS_NAME_CHAR
708# undef IS_NAME_CHAR_MINBPC
709# undef IS_NMSTRT_CHAR
710# undef IS_NMSTRT_CHAR_MINBPC
711# undef IS_INVALID_CHAR
712
713#endif /* not XML_MIN_SIZE */
714
715#ifdef XML_NS
716
717static const struct normal_encoding big2_encoding_ns = { { VTABLE, 2, 0,
718# if XML_BYTE_ORDER == 21
719 1
720# else
721 0
722# endif
723 },
724 {
725# include "asciitab.h"
726# include "latin1tab.h"
727 },
728 STANDARD_VTABLE( big2_ ) };
729
730#endif
731
732static const struct normal_encoding big2_encoding = { { VTABLE, 2, 0,
733#if XML_BYTE_ORDER == 21
734 1
735#else
736 0
737#endif
738 },
739 {
740#define BT_COLON BT_NMSTRT
741#include "asciitab.h"
742#undef BT_COLON
743#include "latin1tab.h"
744 },
745 STANDARD_VTABLE( big2_ ) };
746
747#if XML_BYTE_ORDER != 12
748
749# ifdef XML_NS
750
751static const struct normal_encoding internal_big2_encoding_ns = { { VTABLE, 2, 0, 1 },
752 {
753# include "iasciitab.h"
754# include "latin1tab.h"
755 },
756 STANDARD_VTABLE( big2_ ) };
757
758# endif
759
760static const struct normal_encoding internal_big2_encoding = { { VTABLE, 2, 0, 1 },
761 {
762# define BT_COLON BT_NMSTRT
763# include "iasciitab.h"
764# undef BT_COLON
765# include "latin1tab.h"
766 },
767 STANDARD_VTABLE( big2_ ) };
768
769#endif
770
771#undef PREFIX
772
/* Compare two NUL-terminated strings, treating ASCII letters
   case-insensitively.  Returns 1 when equal, 0 otherwise. */
static int streqci( const char* s1, const char* s2 ) {
  char a;
  char b;
  do {
    a = *s1++;
    b = *s2++;
    if ( a >= 'a' && a <= 'z' ) a += 'A' - 'a';
    if ( b >= 'a' && b <= 'z' ) b += 'A' - 'a';
    if ( a != b ) return 0;
  } while ( a != '\0' );
  return 1;
}
785
786static void initUpdatePosition( const ENCODING* enc, const char* ptr, const char* end,
787 POSITION* pos ) {
788 normal_updatePosition( &utf8_encoding.enc, ptr, end, pos );
789}
790
791static int toAscii( const ENCODING* enc, const char* ptr, const char* end ) {
792 char buf[1];
793 char* p = buf;
794 XmlUtf8Convert( enc, &ptr, end, &p, p + 1 );
795 if ( p == buf ) return -1;
796 else return buf[0];
797}
798
/* Returns 1 for space, carriage return, line feed and tab; 0 otherwise. */
static int isSpace( int c ) {
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
809
810/* Return 1 if there's just optional white space
811or there's an S followed by name=val. */
812static int parsePseudoAttribute( const ENCODING* enc, const char* ptr, const char* end,
813 const char** namePtr, const char** valPtr,
814 const char** nextTokPtr ) {
815 int c;
816 char open;
817 if ( ptr == end )
818 {
819 *namePtr = 0;
820 return 1;
821 }
822 if ( !isSpace( toAscii( enc, ptr, end ) ) )
823 {
824 *nextTokPtr = ptr;
825 return 0;
826 }
827 do {
828 ptr += enc->minBytesPerChar;
829 } while ( isSpace( toAscii( enc, ptr, end ) ) );
830 if ( ptr == end )
831 {
832 *namePtr = 0;
833 return 1;
834 }
835 *namePtr = ptr;
836 for ( ;; )
837 {
838 c = toAscii( enc, ptr, end );
839 if ( c == -1 )
840 {
841 *nextTokPtr = ptr;
842 return 0;
843 }
844 if ( c == '=' ) break;
845 if ( isSpace( c ) )
846 {
847 do {
848 ptr += enc->minBytesPerChar;
849 } while ( isSpace( c = toAscii( enc, ptr, end ) ) );
850 if ( c != '=' )
851 {
852 *nextTokPtr = ptr;
853 return 0;
854 }
855 break;
856 }
857 ptr += enc->minBytesPerChar;
858 }
859 if ( ptr == *namePtr )
860 {
861 *nextTokPtr = ptr;
862 return 0;
863 }
864 ptr += enc->minBytesPerChar;
865 c = toAscii( enc, ptr, end );
866 while ( isSpace( c ) )
867 {
868 ptr += enc->minBytesPerChar;
869 c = toAscii( enc, ptr, end );
870 }
871 if ( c != '"' && c != '\'' )
872 {
873 *nextTokPtr = ptr;
874 return 0;
875 }
876 open = c;
877 ptr += enc->minBytesPerChar;
878 *valPtr = ptr;
879 for ( ;; ptr += enc->minBytesPerChar )
880 {
881 c = toAscii( enc, ptr, end );
882 if ( c == open ) break;
883 if ( !( 'a' <= c && c <= 'z' ) && !( 'A' <= c && c <= 'Z' ) && !( '0' <= c && c <= '9' ) &&
884 c != '.' && c != '-' && c != '_' )
885 {
886 *nextTokPtr = ptr;
887 return 0;
888 }
889 }
890 *nextTokPtr = ptr + enc->minBytesPerChar;
891 return 1;
892}
893
894static int
895doParseXmlDecl( const ENCODING* ( *encodingFinder )(const ENCODING*, const char*, const char*),
896 int isGeneralTextEntity, const ENCODING* enc, const char* ptr, const char* end,
897 const char** badPtr, const char** versionPtr, const char** encodingName,
898 const ENCODING** encoding, int* standalone ) {
899 const char* val = 0;
900 const char* name = 0;
901 ptr += 5 * enc->minBytesPerChar;
902 end -= 2 * enc->minBytesPerChar;
903 if ( !parsePseudoAttribute( enc, ptr, end, &name, &val, &ptr ) || !name )
904 {
905 *badPtr = ptr;
906 return 0;
907 }
908 if ( !XmlNameMatchesAscii( enc, name, "version" ) )
909 {
910 if ( !isGeneralTextEntity )
911 {
912 *badPtr = name;
913 return 0;
914 }
915 }
916 else
917 {
918 if ( versionPtr ) *versionPtr = val;
919 if ( !parsePseudoAttribute( enc, ptr, end, &name, &val, &ptr ) )
920 {
921 *badPtr = ptr;
922 return 0;
923 }
924 if ( !name )
925 {
926 if ( isGeneralTextEntity )
927 {
928 /* a TextDecl must have an EncodingDecl */
929 *badPtr = ptr;
930 return 0;
931 }
932 return 1;
933 }
934 }
935 if ( XmlNameMatchesAscii( enc, name, "encoding" ) )
936 {
937 int c = toAscii( enc, val, end );
938 if ( !( 'a' <= c && c <= 'z' ) && !( 'A' <= c && c <= 'Z' ) )
939 {
940 *badPtr = val;
941 return 0;
942 }
943 if ( encodingName ) *encodingName = val;
944 if ( encoding ) *encoding = encodingFinder( enc, val, ptr - enc->minBytesPerChar );
945 if ( !parsePseudoAttribute( enc, ptr, end, &name, &val, &ptr ) )
946 {
947 *badPtr = ptr;
948 return 0;
949 }
950 if ( !name ) return 1;
951 }
952 if ( !XmlNameMatchesAscii( enc, name, "standalone" ) || isGeneralTextEntity )
953 {
954 *badPtr = name;
955 return 0;
956 }
957 if ( XmlNameMatchesAscii( enc, val, "yes" ) )
958 {
959 if ( standalone ) *standalone = 1;
960 }
961 else if ( XmlNameMatchesAscii( enc, val, "no" ) )
962 {
963 if ( standalone ) *standalone = 0;
964 }
965 else
966 {
967 *badPtr = val;
968 return 0;
969 }
970 while ( isSpace( toAscii( enc, ptr, end ) ) ) ptr += enc->minBytesPerChar;
971 if ( ptr != end )
972 {
973 *badPtr = ptr;
974 return 0;
975 }
976 return 1;
977}
978
979static int checkCharRefNumber( int result ) {
980 switch ( result >> 8 )
981 {
982 case 0xD8:
983 case 0xD9:
984 case 0xDA:
985 case 0xDB:
986 case 0xDC:
987 case 0xDD:
988 case 0xDE:
989 case 0xDF: return -1;
990 case 0:
991 if ( latin1_encoding.type[result] == BT_NONXML ) return -1;
992 break;
993 case 0xFF:
994 if ( result == 0xFFFE || result == 0xFFFF ) return -1;
995 break;
996 }
997 return result;
998}
999
1000int XmlUtf8Encode( int c, char* buf ) {
1001 enum {
1002 /* minN is minimum legal resulting value for N byte sequence */
1003 min2 = 0x80,
1004 min3 = 0x800,
1005 min4 = 0x10000
1006 };
1007
1008 if ( c < 0 ) return 0;
1009 if ( c < min2 )
1010 {
1011 buf[0] = ( c | UTF8_cval1 );
1012 return 1;
1013 }
1014 if ( c < min3 )
1015 {
1016 buf[0] = ( ( c >> 6 ) | UTF8_cval2 );
1017 buf[1] = ( ( c & 0x3f ) | 0x80 );
1018 return 2;
1019 }
1020 if ( c < min4 )
1021 {
1022 buf[0] = ( ( c >> 12 ) | UTF8_cval3 );
1023 buf[1] = ( ( ( c >> 6 ) & 0x3f ) | 0x80 );
1024 buf[2] = ( ( c & 0x3f ) | 0x80 );
1025 return 3;
1026 }
1027 if ( c < 0x110000 )
1028 {
1029 buf[0] = ( ( c >> 18 ) | UTF8_cval4 );
1030 buf[1] = ( ( ( c >> 12 ) & 0x3f ) | 0x80 );
1031 buf[2] = ( ( ( c >> 6 ) & 0x3f ) | 0x80 );
1032 buf[3] = ( ( c & 0x3f ) | 0x80 );
1033 return 4;
1034 }
1035 return 0;
1036}
1037
/* Encode code point charNum as UTF-16 into buf.
   Returns 1 for values below 0x10000 (stored as-is), 2 for values below
   0x110000 (stored as a surrogate pair), and 0 for anything else, in which
   case buf is left untouched. */
int XmlUtf16Encode( int charNum, unsigned short* buf ) {
  if ( charNum < 0 || charNum >= 0x110000 ) return 0;

  if ( charNum >= 0x10000 )
  {
    const int offset = charNum - 0x10000;
    buf[0] = 0xD800 + ( offset >> 10 );
    buf[1] = 0xDC00 + ( offset & 0x3FF );
    return 2;
  }

  buf[0] = charNum;
  return 1;
}
1054
1057 int ( *convert )( void* userData, const char* p );
1059 unsigned short utf16[256];
1060 char utf8[256][4];
1061};
1062
1063int XmlSizeOfUnknownEncoding() { return sizeof( struct unknown_encoding ); }
1064
1065static int unknown_isName( const ENCODING* enc, const char* p ) {
1066 int c = ( (const struct unknown_encoding*)enc )
1067 ->convert( ( (const struct unknown_encoding*)enc )->userData, p );
1068 if ( c & ~0xFFFF ) return 0;
1069 return UCS2_GET_NAMING( namePages, c >> 8, c & 0xFF );
1070}
1071
1072static int unknown_isNmstrt( const ENCODING* enc, const char* p ) {
1073 int c = ( (const struct unknown_encoding*)enc )
1074 ->convert( ( (const struct unknown_encoding*)enc )->userData, p );
1075 if ( c & ~0xFFFF ) return 0;
1076 return UCS2_GET_NAMING( nmstrtPages, c >> 8, c & 0xFF );
1077}
1078
1079static int unknown_isInvalid( const ENCODING* enc, const char* p ) {
1080 int c = ( (const struct unknown_encoding*)enc )
1081 ->convert( ( (const struct unknown_encoding*)enc )->userData, p );
1082 return ( c & ~0xFFFF ) || checkCharRefNumber( c ) < 0;
1083}
1084
1085static void unknown_toUtf8( const ENCODING* enc, const char** fromP, const char* fromLim,
1086 char** toP, const char* toLim ) {
1087 char buf[XML_UTF8_ENCODE_MAX];
1088 for ( ;; )
1089 {
1090 const char* utf8;
1091 int n;
1092 if ( *fromP == fromLim ) break;
1093 utf8 = ( (const struct unknown_encoding*)enc )->utf8[(unsigned char)**fromP];
1094 n = *utf8++;
1095 if ( n == 0 )
1096 {
1097 int c = ( (const struct unknown_encoding*)enc )
1098 ->convert( ( (const struct unknown_encoding*)enc )->userData, *fromP );
1099 n = XmlUtf8Encode( c, buf );
1100 if ( n > toLim - *toP ) break;
1101 utf8 = buf;
1102 *fromP += ( (const struct normal_encoding*)enc )->type[(unsigned char)**fromP] -
1103 ( BT_LEAD2 - 2 );
1104 }
1105 else
1106 {
1107 if ( n > toLim - *toP ) break;
1108 ( *fromP )++;
1109 }
1110 do {
1111 *( *toP )++ = *utf8++;
1112 } while ( --n != 0 );
1113 }
1114}
1115
1116static void unknown_toUtf16( const ENCODING* enc, const char** fromP, const char* fromLim,
1117 unsigned short** toP, const unsigned short* toLim ) {
1118 while ( *fromP != fromLim && *toP != toLim )
1119 {
1120 unsigned short c = ( (const struct unknown_encoding*)enc )->utf16[(unsigned char)**fromP];
1121 if ( c == 0 )
1122 {
1123 c = (unsigned short)( (const struct unknown_encoding*)enc )
1124 ->convert( ( (const struct unknown_encoding*)enc )->userData, *fromP );
1125 *fromP += ( (const struct normal_encoding*)enc )->type[(unsigned char)**fromP] -
1126 ( BT_LEAD2 - 2 );
1127 }
1128 else ( *fromP )++;
1129 *( *toP )++ = c;
1130 }
1131}
1132
1133ENCODING* XmlInitUnknownEncoding( void* mem, int* table,
1134 int ( *convert )( void* userData, const char* p ),
1135 void* userData ) {
1136 int i;
1137 struct unknown_encoding* e = mem;
1138 for ( i = 0; i < sizeof( struct normal_encoding ); i++ )
1139 ( (char*)mem )[i] = ( (char*)&latin1_encoding )[i];
1140 for ( i = 0; i < 128; i++ )
1141 if ( latin1_encoding.type[i] != BT_OTHER && latin1_encoding.type[i] != BT_NONXML &&
1142 table[i] != i )
1143 return 0;
1144 for ( i = 0; i < 256; i++ )
1145 {
1146 int c = table[i];
1147 if ( c == -1 )
1148 {
1149 e->normal.type[i] = BT_MALFORM;
1150 /* This shouldn't really get used. */
1151 e->utf16[i] = 0xFFFF;
1152 e->utf8[i][0] = 1;
1153 e->utf8[i][1] = 0;
1154 }
1155 else if ( c < 0 )
1156 {
1157 if ( c < -4 ) return 0;
1158 e->normal.type[i] = BT_LEAD2 - ( c + 2 );
1159 e->utf8[i][0] = 0;
1160 e->utf16[i] = 0;
1161 }
1162 else if ( c < 0x80 )
1163 {
1164 if ( latin1_encoding.type[c] != BT_OTHER && latin1_encoding.type[c] != BT_NONXML &&
1165 c != i )
1166 return 0;
1167 e->normal.type[i] = latin1_encoding.type[c];
1168 e->utf8[i][0] = 1;
1169 e->utf8[i][1] = (char)c;
1170 e->utf16[i] = c == 0 ? 0xFFFF : c;
1171 }
1172 else if ( checkCharRefNumber( c ) < 0 )
1173 {
1174 e->normal.type[i] = BT_NONXML;
1175 /* This shouldn't really get used. */
1176 e->utf16[i] = 0xFFFF;
1177 e->utf8[i][0] = 1;
1178 e->utf8[i][1] = 0;
1179 }
1180 else
1181 {
1182 if ( c > 0xFFFF ) return 0;
1183 if ( UCS2_GET_NAMING( nmstrtPages, c >> 8, c & 0xff ) ) e->normal.type[i] = BT_NMSTRT;
1184 else if ( UCS2_GET_NAMING( namePages, c >> 8, c & 0xff ) ) e->normal.type[i] = BT_NAME;
1185 else e->normal.type[i] = BT_OTHER;
1186 e->utf8[i][0] = (char)XmlUtf8Encode( c, e->utf8[i] + 1 );
1187 e->utf16[i] = c;
1188 }
1189 }
1190 e->userData = userData;
1191 e->convert = convert;
1192 if ( convert )
1193 {
1194 e->normal.isName2 = unknown_isName;
1195 e->normal.isName3 = unknown_isName;
1196 e->normal.isName4 = unknown_isName;
1197 e->normal.isNmstrt2 = unknown_isNmstrt;
1198 e->normal.isNmstrt3 = unknown_isNmstrt;
1199 e->normal.isNmstrt4 = unknown_isNmstrt;
1200 e->normal.isInvalid2 = unknown_isInvalid;
1201 e->normal.isInvalid3 = unknown_isInvalid;
1202 e->normal.isInvalid4 = unknown_isInvalid;
1203 }
1204 e->normal.enc.utf8Convert = unknown_toUtf8;
1205 e->normal.enc.utf16Convert = unknown_toUtf16;
1206 return &( e->normal.enc );
1207}
1208
1209/* If this enumeration is changed, getEncodingIndex and encodings
1210must also be changed. */
enum {
  /* Returned for names that are not in encodingNames; -1 keeps the named
     encodings below aligned with the array indices, which start at 0. */
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1222
1223static int getEncodingIndex( const char* name ) {
1224 static const char* encodingNames[] = {
1225 "ISO-8859-1",
1226 "US-ASCII",
1227 "UTF-8",
1228 "UTF-16",
1229 "UTF-16BE"
1230 "UTF-16LE",
1231 };
1232 int i;
1233 if ( name == 0 ) return NO_ENC;
1234 for ( i = 0; i < sizeof( encodingNames ) / sizeof( encodingNames[0] ); i++ )
1235 if ( streqci( name, encodingNames[i] ) ) return i;
1236 return UNKNOWN_ENC;
1237}
1238
/* For binary compatibility, the index of the encoding specified at
   initialization is kept in the isUtf16 member; this macro names that slot. */

#define INIT_ENC_INDEX( initEncPtr ) ( ( initEncPtr )->initEnc.isUtf16 )
1243
1244/* This is what detects the encoding.
1245encodingTable maps from encoding indices to encodings;
1246INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1247state is XML_CONTENT_STATE if we're parsing an external text entity,
1248and XML_PROLOG_STATE otherwise.
1249*/
1250
1251static int initScan( const ENCODING** encodingTable, const INIT_ENCODING* enc, int state,
1252 const char* ptr, const char* end, const char** nextTokPtr ) {
1253 const ENCODING** encPtr;
1254
1255 if ( ptr == end ) return XML_TOK_NONE;
1256 encPtr = enc->encPtr;
1257 if ( ptr + 1 == end )
1258 {
1259 /* only a single byte available for auto-detection */
1260 /* a well-formed document entity must have more than one byte */
1261 if ( state != XML_CONTENT_STATE ) return XML_TOK_PARTIAL;
1262 /* so we're parsing an external text entity... */
1263 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1264 switch ( INIT_ENC_INDEX( enc ) )
1265 {
1266 case UTF_16_ENC:
1267 case UTF_16LE_ENC:
1268 case UTF_16BE_ENC: return XML_TOK_PARTIAL;
1269 }
1270 switch ( (unsigned char)*ptr )
1271 {
1272 case 0xFE:
1273 case 0xFF:
1274 case 0xEF: /* possibly first byte of UTF-8 BOM */
1275 if ( INIT_ENC_INDEX( enc ) == ISO_8859_1_ENC && state == XML_CONTENT_STATE ) break;
1276 /* fall through */
1277 case 0x00:
1278 case 0x3C: return XML_TOK_PARTIAL;
1279 }
1280 }
1281 else
1282 {
1283 switch ( ( (unsigned char)ptr[0] << 8 ) | (unsigned char)ptr[1] )
1284 {
1285 case 0xFEFF:
1286 if ( INIT_ENC_INDEX( enc ) == ISO_8859_1_ENC && state == XML_CONTENT_STATE ) break;
1287 *nextTokPtr = ptr + 2;
1288 *encPtr = encodingTable[UTF_16BE_ENC];
1289 return XML_TOK_BOM;
1290 /* 00 3C is handled in the default case */
1291 case 0x3C00:
1292 if ( ( INIT_ENC_INDEX( enc ) == UTF_16BE_ENC || INIT_ENC_INDEX( enc ) == UTF_16_ENC ) &&
1293 state == XML_CONTENT_STATE )
1294 break;
1295 *encPtr = encodingTable[UTF_16LE_ENC];
1296 return XmlTok( *encPtr, state, ptr, end, nextTokPtr );
1297 case 0xFFFE:
1298 if ( INIT_ENC_INDEX( enc ) == ISO_8859_1_ENC && state == XML_CONTENT_STATE ) break;
1299 *nextTokPtr = ptr + 2;
1300 *encPtr = encodingTable[UTF_16LE_ENC];
1301 return XML_TOK_BOM;
1302 case 0xEFBB:
1303 /* Maybe a UTF-8 BOM (EF BB BF) */
1304 /* If there's an explicitly specified (external) encoding
1305 of ISO-8859-1 or some flavour of UTF-16
1306 and this is an external text entity,
1307 don't look for the BOM,
1308 because it might be a legal data. */
1309 if ( state == XML_CONTENT_STATE )
1310 {
1311 int e = INIT_ENC_INDEX( enc );
1312 if ( e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC )
1313 break;
1314 }
1315 if ( ptr + 2 == end ) return XML_TOK_PARTIAL;
1316 if ( (unsigned char)ptr[2] == 0xBF )
1317 {
1318 *encPtr = encodingTable[UTF_8_ENC];
1319 return XML_TOK_BOM;
1320 }
1321 break;
1322 default:
1323 if ( ptr[0] == '\0' )
1324 {
1325 /* 0 isn't a legal data character. Furthermore a document entity can only
1326 start with ASCII characters. So the only way this can fail to be big-endian
1327 UTF-16 if it it's an external parsed general entity that's labelled as
1328 UTF-16LE. */
1329 if ( state == XML_CONTENT_STATE && INIT_ENC_INDEX( enc ) == UTF_16LE_ENC ) break;
1330 *encPtr = encodingTable[UTF_16BE_ENC];
1331 return XmlTok( *encPtr, state, ptr, end, nextTokPtr );
1332 }
1333 else if ( ptr[1] == '\0' )
1334 {
1335 /* We could recover here in the case:
1336 - parsing an external entity
1337 - second byte is 0
1338 - no externally specified encoding
1339 - no encoding declaration
1340 by assuming UTF-16LE. But we don't, because this would mean when
1341 presented just with a single byte, we couldn't reliably determine
1342 whether we needed further bytes. */
1343 if ( state == XML_CONTENT_STATE ) break;
1344 *encPtr = encodingTable[UTF_16LE_ENC];
1345 return XmlTok( *encPtr, state, ptr, end, nextTokPtr );
1346 }
1347 break;
1348 }
1349 }
1350 *encPtr = encodingTable[INIT_ENC_INDEX( enc )];
1351 return XmlTok( *encPtr, state, ptr, end, nextTokPtr );
1352}
1353
1354#define NS( x ) x
1355#define ns( x ) x
1356#include "xmltok_ns.c"
1357#undef NS
1358#undef ns
1359
1360#ifdef XML_NS
1361
1362# define NS( x ) x##NS
1363# define ns( x ) x##_ns
1364
1365# include "xmltok_ns.c"
1366
1367# undef NS
1368# undef ns
1369
1370ENCODING* XmlInitUnknownEncodingNS( void* mem, int* table,
1371 int ( *convert )( void* userData, const char* p ),
1372 void* userData ) {
1373 ENCODING* enc = XmlInitUnknownEncoding( mem, table, convert, userData );
1374 if ( enc ) ( (struct normal_encoding*)enc )->type[':'] = BT_COLON;
1375 return enc;
1376}
1377
1378#endif /* XML_NS */
const Int_t n
m_outputFile open("YYYY/m_txt_dir/LumTau_XXXX.txt", ios_base::app)
int(* isInvalid4)(const ENCODING *, const char *)
Definition xmltok.c:134
int(* isInvalid2)(const ENCODING *, const char *)
Definition xmltok.c:132
unsigned char type[256]
Definition xmltok.c:118
int(* isNmstrt4)(const ENCODING *, const char *)
Definition xmltok.c:131
int(* isName3)(const ENCODING *, const char *)
Definition xmltok.c:127
int(* isName2)(const ENCODING *, const char *)
Definition xmltok.c:126
int(* isNmstrt2)(const ENCODING *, const char *)
Definition xmltok.c:129
int(* isName4)(const ENCODING *, const char *)
Definition xmltok.c:128
int(* isNmstrt3)(const ENCODING *, const char *)
Definition xmltok.c:130
int(* isInvalid3)(const ENCODING *, const char *)
Definition xmltok.c:133
ENCODING enc
Definition xmltok.c:117
unsigned short utf16[256]
Definition xmltok.c:1059
int(* convert)(void *userData, const char *p)
Definition xmltok.c:1057
void * userData
Definition xmltok.c:1058
char utf8[256][4]
Definition xmltok.c:1060
struct normal_encoding normal
Definition xmltok.c:1056
#define BIG2_BYTE_TO_ASCII(enc, p)
Definition xmltok.c:655
#define STANDARD_VTABLE(E)
Definition xmltok.c:144
#define VTABLE1
Definition xmltok.c:39
int XmlUtf8Encode(int c, char *buf)
Definition xmltok.c:1000
#define UTF8_GET_NAMING3(pages, byte)
Definition xmltok.c:64
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p)
Definition xmltok.c:657
#define DEFINE_UTF16_TO_UTF8(E)
Definition xmltok.c:423
#define BIG2_BYTE_TYPE(enc, p)
Definition xmltok.c:652
int XmlSizeOfUnknownEncoding()
Definition xmltok.c:1063
ENCODING * XmlInitUnknownEncoding(void *mem, int *table, int(*convert)(void *userData, const char *p), void *userData)
Definition xmltok.c:1133
@ UTF8_cval4
Definition xmltok.c:230
@ UTF8_cval1
Definition xmltok.c:227
@ UTF8_cval2
Definition xmltok.c:228
@ UTF8_cval3
Definition xmltok.c:229
#define NORMAL_VTABLE(E)
Definition xmltok.c:148
#define INIT_ENC_INDEX(enc)
Definition xmltok.c:1242
int XmlUtf16Encode(int charNum, unsigned short *buf)
Definition xmltok.c:1038
#define SB_BYTE_TYPE(enc, p)
Definition xmltok.c:168
#define VTABLE
Definition xmltok.c:46
#define LITTLE2_CHAR_MATCHES(enc, p, c)
Definition xmltok.c:535
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition xmltok.c:659
#define LITTLE2_BYTE_TYPE(enc, p)
Definition xmltok.c:531
#define BT_COLON
#define UTF8_INVALID4(p)
Definition xmltok.c:82
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
Definition xmltok.c:536
#define UCS2_GET_NAMING(pages, hi, lo)
Definition xmltok.c:48
#define UTF8_GET_NAMING2(pages, byte)
Definition xmltok.c:55
#define LITTLE2_BYTE_TO_ASCII(enc, p)
Definition xmltok.c:534
@ NO_ENC
Definition xmltok.c:1220
@ US_ASCII_ENC
Definition xmltok.c:1214
@ ISO_8859_1_ENC
Definition xmltok.c:1213
@ UTF_8_ENC
Definition xmltok.c:1215
@ UTF_16_ENC
Definition xmltok.c:1216
@ UNKNOWN_ENC
Definition xmltok.c:1212
@ UTF_16BE_ENC
Definition xmltok.c:1217
@ UTF_16LE_ENC
Definition xmltok.c:1218
#define UTF8_INVALID3(p)
Definition xmltok.c:76
#define DEFINE_UTF16_TO_UTF16(E)
Definition xmltok.c:496
#define BIG2_CHAR_MATCHES(enc, p, c)
Definition xmltok.c:656
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition xmltok.c:538