Context Navigation

source: protocols/jabber/xmltok.c @ 9b63df6

Last change on this file since 9b63df6 was b7d3cc34, checked in by Wilmer van der Gaast <wilmer@…>, at 2005-11-06T18:23:18Z
Initial repository (0.99 release tree)
Property mode set to `100644`
File size: 40.0 KB

Line
1	/*
2	The contents of this file are subject to the Mozilla Public License
3	Version 1.1 (the "License"); you may not use this file except in
4	compliance with the License. You may obtain a copy of the License at
5	http://www.mozilla.org/MPL/
6
7	Software distributed under the License is distributed on an "AS IS"
8	basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9	License for the specific language governing rights and limitations
10	under the License.
11
12	The Original Code is expat.
13
14	The Initial Developer of the Original Code is James Clark.
15	Portions created by James Clark are Copyright (C) 1998, 1999
16	James Clark. All Rights Reserved.
17
18	Contributor(s):
19
20	*/
21
22	#include "xmldef.h"
23	#include "xmltok.h"
24	#include "nametab.h"
25
/* Initializer list for the leading function-pointer members of an ENCODING:
   two braced groups of tokenizer entry points followed by the individual
   helper functions.  PREFIX() must be defined where this is expanded; it
   selects the function family (normal_, little2_ or big2_ in this file). */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the family's own UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
40
/* Nonzero if the 16-bit character with high byte hi and low byte lo has its
   bit set in namingBitmap.  pages[hi] selects a group of eight bitmap words,
   the top three bits of lo pick the word and the bottom five bits pick the
   bit.  The mask is built from 1u because bit 31 is a legitimate index and
   shifting a signed 1 into the sign bit is undefined behavior. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits
   between the bottom 5 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits
   between the bottom 4, 6 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   in the bitmap, any other length is never a naming character. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : ((n) == 3 \
      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
      : 0))
72
/* Nonzero if the 3 byte UTF-8 sequence at p (an unsigned char pointer) must
   be rejected: lead byte 0xED with bit 0x20 set in the second byte (the
   surrogate range U+D800..U+DFFF), or 0xEF 0xBF followed by 0xBE or 0xBF
   (U+FFFE and U+FFFF). */
#define UTF8_INVALID3(p) \
  ((*(p)) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : ((*(p)) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))

/* Nonzero if the 4 byte UTF-8 sequence at p encodes a value above U+10FFFF:
   lead byte 0xF4 with either of bits 0x30 set in the second byte. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
81
82	static
83	int isNever(const ENCODING enc, const char p)
84	{
85	return 0;
86	}
87
88	static
89	int utf8_isName2(const ENCODING enc, const char p)
90	{
91	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
92	}
93
94	static
95	int utf8_isName3(const ENCODING enc, const char p)
96	{
97	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
98	}
99
100	#define utf8_isName4 isNever
101
102	static
103	int utf8_isNmstrt2(const ENCODING enc, const char p)
104	{
105	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
106	}
107
108	static
109	int utf8_isNmstrt3(const ENCODING enc, const char p)
110	{
111	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
112	}
113
114	#define utf8_isNmstrt4 isNever
115
116	#define utf8_isInvalid2 isNever
117
118	static
119	int utf8_isInvalid3(const ENCODING enc, const char p)
120	{
121	return UTF8_INVALID3((const unsigned char *)p);
122	}
123
124	static
125	int utf8_isInvalid4(const ENCODING enc, const char p)
126	{
127	return UTF8_INVALID4((const unsigned char *)p);
128	}
129
130	struct normal_encoding {
131	ENCODING enc;
132	unsigned char type[256];
133	#ifdef XML_MIN_SIZE
134	int (byteType)(const ENCODING , const char *);
135	int (isNameMin)(const ENCODING , const char *);
136	int (isNmstrtMin)(const ENCODING , const char *);
137	int (byteToAscii)(const ENCODING , const char *);
138	int (charMatches)(const ENCODING , const char *, int);
139	#endif /* XML_MIN_SIZE */
140	int (isName2)(const ENCODING , const char *);
141	int (isName3)(const ENCODING , const char *);
142	int (isName4)(const ENCODING , const char *);
143	int (isNmstrt2)(const ENCODING , const char *);
144	int (isNmstrt3)(const ENCODING , const char *);
145	int (isNmstrt4)(const ENCODING , const char *);
146	int (isInvalid2)(const ENCODING , const char *);
147	int (isInvalid3)(const ENCODING , const char *);
148	int (isInvalid4)(const ENCODING , const char *);
149	};
150
#ifdef XML_MIN_SIZE

/* Initializers for the XML_MIN_SIZE-only members of struct normal_encoding,
   taken from the function family named by the prefix E.  Note the trailing
   comma: NORMAL_VTABLE() or the closing brace follows directly. */
#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the multi-byte predicate members of
   struct normal_encoding, taken from the function family named by E. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4

/* Defined further down in this file; declared here so it is visible
   before xmltok_impl.c is included. */
static int checkCharRefNumber(int);
178
179	#include "xmltok_impl.h"
180
#ifdef XML_MIN_SIZE
/* Single-byte encodings have no "minimum bytes per char" naming check. */
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
/* One shared tokenizer instance: ask the encoding at run time. */
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
192
/* Byte type of the single byte at p, read from the encoding's type table.
   The table is only read, so the cast keeps the const qualifier. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE for the byteType slot. */
static
int sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
207
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
/* For single-byte encodings the byte is returned unchanged. */
static
int sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
219
/* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member name, so
   IS_NAME_CHAR(enc, p, 2) calls enc's isName2 slot, and so on. */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
/* Never true for the single-byte instantiation. */
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
236
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
/* Nonzero if the single byte at p equals the ASCII character c. */
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
249
250	#define PREFIX(ident) normal_ ## ident
251	#include "xmltok_impl.c"
252
253	#undef MINBPC
254	#undef BYTE_TYPE
255	#undef BYTE_TO_ASCII
256	#undef CHAR_MATCHES
257	#undef IS_NAME_CHAR
258	#undef IS_NAME_CHAR_MINBPC
259	#undef IS_NMSTRT_CHAR
260	#undef IS_NMSTRT_CHAR_MINBPC
261	#undef IS_INVALID_CHAR
262
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
269
270	static
271	void utf8_toUtf8(const ENCODING *enc,
272	const char *fromP, const char fromLim,
273	char *toP, const char toLim)
274	{
275	char *to;
276	const char *from;
277	if (fromLim - fromP > toLim - toP) {
278	/* Avoid copying partial characters. */
279	for (fromLim = fromP + (toLim - toP); fromLim > *fromP; fromLim--)
280	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
281	break;
282	}
283	for (to = toP, from = fromP; from != fromLim; from++, to++)
284	to = from;
285	*fromP = from;
286	*toP = to;
287	}
288
289	static
290	void utf8_toUtf16(const ENCODING *enc,
291	const char *fromP, const char fromLim,
292	unsigned short *toP, const unsigned short toLim)
293	{
294	unsigned short to = toP;
295	const char from = fromP;
296	while (from != fromLim && to != toLim) {
297	switch (((struct normal_encoding )enc)->type[(unsigned char)from]) {
298	case BT_LEAD2:
299	*to++ = ((from[0] & 0x1f) << 6) \| (from[1] & 0x3f);
300	from += 2;
301	break;
302	case BT_LEAD3:
303	*to++ = ((from[0] & 0xf) << 12) \| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f);
304	from += 3;
305	break;
306	case BT_LEAD4:
307	{
308	unsigned long n;
309	if (to + 1 == toLim)
310	break;
311	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12) \| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
312	n -= 0x10000;
313	to[0] = (unsigned short)((n >> 10) \| 0xD800);
314	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
315	to += 2;
316	from += 4;
317	}
318	break;
319	default:
320	to++ = from++;
321	break;
322	}
323	}
324	*fromP = from;
325	*toP = to;
326	}
327
328	#ifdef XML_NS
329	static const struct normal_encoding utf8_encoding_ns = {
330	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
331	{
332	#include "asciitab.h"
333	#include "utf8tab.h"
334	},
335	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
336	};
337	#endif
338
339	static const struct normal_encoding utf8_encoding = {
340	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
341	{
342	#define BT_COLON BT_NMSTRT
343	#include "asciitab.h"
344	#undef BT_COLON
345	#include "utf8tab.h"
346	},
347	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
348	};
349
350	#ifdef XML_NS
351
352	static const struct normal_encoding internal_utf8_encoding_ns = {
353	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
354	{
355	#include "iasciitab.h"
356	#include "utf8tab.h"
357	},
358	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
359	};
360
361	#endif
362
363	static const struct normal_encoding internal_utf8_encoding = {
364	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
365	{
366	#define BT_COLON BT_NMSTRT
367	#include "iasciitab.h"
368	#undef BT_COLON
369	#include "utf8tab.h"
370	},
371	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
372	};
373
374	static
375	void latin1_toUtf8(const ENCODING *enc,
376	const char *fromP, const char fromLim,
377	char *toP, const char toLim)
378	{
379	for (;;) {
380	unsigned char c;
381	if (*fromP == fromLim)
382	break;
383	c = (unsigned char)**fromP;
384	if (c & 0x80) {
385	if (toLim - *toP < 2)
386	break;
387	(toP)++ = ((c >> 6) \| UTF8_cval2);
388	(toP)++ = ((c & 0x3f) \| 0x80);
389	(*fromP)++;
390	}
391	else {
392	if (*toP == toLim)
393	break;
394	(toP)++ = (fromP)++;
395	}
396	}
397	}
398
399	static
400	void latin1_toUtf16(const ENCODING *enc,
401	const char *fromP, const char fromLim,
402	unsigned short *toP, const unsigned short toLim)
403	{
404	while (fromP != fromLim && toP != toLim)
405	(toP)++ = (unsigned char)(fromP)++;
406	}
407
408	#ifdef XML_NS
409
410	static const struct normal_encoding latin1_encoding_ns = {
411	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
412	{
413	#include "asciitab.h"
414	#include "latin1tab.h"
415	},
416	STANDARD_VTABLE(sb_)
417	};
418
419	#endif
420
421	static const struct normal_encoding latin1_encoding = {
422	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
423	{
424	#define BT_COLON BT_NMSTRT
425	#include "asciitab.h"
426	#undef BT_COLON
427	#include "latin1tab.h"
428	},
429	STANDARD_VTABLE(sb_)
430	};
431
432	static
433	void ascii_toUtf8(const ENCODING *enc,
434	const char *fromP, const char fromLim,
435	char *toP, const char toLim)
436	{
437	while (fromP != fromLim && toP != toLim)
438	(toP)++ = (fromP)++;
439	}
440
441	#ifdef XML_NS
442
443	static const struct normal_encoding ascii_encoding_ns = {
444	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
445	{
446	#include "asciitab.h"
447	/* BT_NONXML == 0 */
448	},
449	STANDARD_VTABLE(sb_)
450	};
451
452	#endif
453
454	static const struct normal_encoding ascii_encoding = {
455	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
456	{
457	#define BT_COLON BT_NMSTRT
458	#include "asciitab.h"
459	#undef BT_COLON
460	/* BT_NONXML == 0 */
461	},
462	STANDARD_VTABLE(sb_)
463	};
464
465	static int unicode_byte_type(char hi, char lo)
466	{
467	switch ((unsigned char)hi) {
468	case 0xD8: case 0xD9: case 0xDA: case 0xDB:
469	return BT_LEAD4;
470	case 0xDC: case 0xDD: case 0xDE: case 0xDF:
471	return BT_TRAIL;
472	case 0xFF:
473	switch ((unsigned char)lo) {
474	case 0xFF:
475	case 0xFE:
476	return BT_NONXML;
477	}
478	break;
479	}
480	return BT_NONASCII;
481	}
482
/* Defines E##toUtf8: convert 16-bit code units at *fromP (up to fromLim)
   into UTF-8 at *toP (up to toLim).  GET_LO/GET_HI must be defined for the
   wanted byte order when the macro is expanded.  If the next character does
   not fit in the output the function stores the current input position in
   *fromP and returns; *toP is advanced as bytes are written.  In the
   surrogate case the second code unit is read without checking it against
   fromLim. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
545
/* Defines E##toUtf16: copy 16-bit code units from the byte order selected
   by GET_LO/GET_HI into native unsigned shorts, advancing *fromP and *toP.
   When the output is smaller than the input and the last input unit has a
   high byte in 0xD8..0xDF, that unit is left unconverted so the copy does
   not end on the first half of a surrogate pair. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
559
560	#define SET2(ptr, ch) \
561	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
562	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
563	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
564
565	DEFINE_UTF16_TO_UTF8(little2_)
566	DEFINE_UTF16_TO_UTF16(little2_)
567
568	#undef SET2
569	#undef GET_LO
570	#undef GET_HI
571
572	#define SET2(ptr, ch) \
573	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
574	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
575	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
576
577	DEFINE_UTF16_TO_UTF8(big2_)
578	DEFINE_UTF16_TO_UTF16(big2_)
579
580	#undef SET2
581	#undef GET_LO
582	#undef GET_HI
583
/* Little-endian 16-bit helpers; p points at the low byte, p[1] is the high
   byte.  Characters with a zero high byte are classified through the
   encoding's type table, all others through unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte if the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
594
595	#ifdef XML_MIN_SIZE
596
597	static
598	int little2_byteType(const ENCODING enc, const char p)
599	{
600	return LITTLE2_BYTE_TYPE(enc, p);
601	}
602
603	static
604	int little2_byteToAscii(const ENCODING enc, const char p)
605	{
606	return LITTLE2_BYTE_TO_ASCII(enc, p);
607	}
608
609	static
610	int little2_charMatches(const ENCODING enc, const char p, int c)
611	{
612	return LITTLE2_CHAR_MATCHES(enc, p, c);
613	}
614
615	static
616	int little2_isNameMin(const ENCODING enc, const char p)
617	{
618	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
619	}
620
621	static
622	int little2_isNmstrtMin(const ENCODING enc, const char p)
623	{
624	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
625	}
626
627	#undef VTABLE
628	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
629
630	#else /* not XML_MIN_SIZE */
631
632	#undef PREFIX
633	#define PREFIX(ident) little2_ ## ident
634	#define MINBPC(enc) 2
635	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
636	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
637	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
638	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
639	#define IS_NAME_CHAR(enc, p, n) 0
640	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
641	#define IS_NMSTRT_CHAR(enc, p, n) (0)
642	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
643
644	#include "xmltok_impl.c"
645
646	#undef MINBPC
647	#undef BYTE_TYPE
648	#undef BYTE_TO_ASCII
649	#undef CHAR_MATCHES
650	#undef IS_NAME_CHAR
651	#undef IS_NAME_CHAR_MINBPC
652	#undef IS_NMSTRT_CHAR
653	#undef IS_NMSTRT_CHAR_MINBPC
654	#undef IS_INVALID_CHAR
655
656	#endif /* not XML_MIN_SIZE */
657
658	#ifdef XML_NS
659
660	static const struct normal_encoding little2_encoding_ns = {
661	{ VTABLE, 2, 0,
662	#if XML_BYTE_ORDER == 12
663	1
664	#else
665	0
666	#endif
667	},
668	{
669	#include "asciitab.h"
670	#include "latin1tab.h"
671	},
672	STANDARD_VTABLE(little2_)
673	};
674
675	#endif
676
677	static const struct normal_encoding little2_encoding = {
678	{ VTABLE, 2, 0,
679	#if XML_BYTE_ORDER == 12
680	1
681	#else
682	0
683	#endif
684	},
685	{
686	#define BT_COLON BT_NMSTRT
687	#include "asciitab.h"
688	#undef BT_COLON
689	#include "latin1tab.h"
690	},
691	STANDARD_VTABLE(little2_)
692	};
693
694	#if XML_BYTE_ORDER != 21
695
696	#ifdef XML_NS
697
698	static const struct normal_encoding internal_little2_encoding_ns = {
699	{ VTABLE, 2, 0, 1 },
700	{
701	#include "iasciitab.h"
702	#include "latin1tab.h"
703	},
704	STANDARD_VTABLE(little2_)
705	};
706
707	#endif
708
709	static const struct normal_encoding internal_little2_encoding = {
710	{ VTABLE, 2, 0, 1 },
711	{
712	#define BT_COLON BT_NMSTRT
713	#include "iasciitab.h"
714	#undef BT_COLON
715	#include "latin1tab.h"
716	},
717	STANDARD_VTABLE(little2_)
718	};
719
720	#endif
721
722
/* Big-endian 16-bit helpers; p points at the high byte, p[1] is the low
   byte.  Characters with a zero high byte are classified through the
   encoding's type table, all others through unicode_byte_type(). */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte if the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
733
734	#ifdef XML_MIN_SIZE
735
736	static
737	int big2_byteType(const ENCODING enc, const char p)
738	{
739	return BIG2_BYTE_TYPE(enc, p);
740	}
741
742	static
743	int big2_byteToAscii(const ENCODING enc, const char p)
744	{
745	return BIG2_BYTE_TO_ASCII(enc, p);
746	}
747
748	static
749	int big2_charMatches(const ENCODING enc, const char p, int c)
750	{
751	return BIG2_CHAR_MATCHES(enc, p, c);
752	}
753
754	static
755	int big2_isNameMin(const ENCODING enc, const char p)
756	{
757	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
758	}
759
760	static
761	int big2_isNmstrtMin(const ENCODING enc, const char p)
762	{
763	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
764	}
765
766	#undef VTABLE
767	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
768
769	#else /* not XML_MIN_SIZE */
770
771	#undef PREFIX
772	#define PREFIX(ident) big2_ ## ident
773	#define MINBPC(enc) 2
774	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
775	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
776	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
777	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
778	#define IS_NAME_CHAR(enc, p, n) 0
779	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
780	#define IS_NMSTRT_CHAR(enc, p, n) (0)
781	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
782
783	#include "xmltok_impl.c"
784
785	#undef MINBPC
786	#undef BYTE_TYPE
787	#undef BYTE_TO_ASCII
788	#undef CHAR_MATCHES
789	#undef IS_NAME_CHAR
790	#undef IS_NAME_CHAR_MINBPC
791	#undef IS_NMSTRT_CHAR
792	#undef IS_NMSTRT_CHAR_MINBPC
793	#undef IS_INVALID_CHAR
794
795	#endif /* not XML_MIN_SIZE */
796
797	#ifdef XML_NS
798
799	static const struct normal_encoding big2_encoding_ns = {
800	{ VTABLE, 2, 0,
801	#if XML_BYTE_ORDER == 21
802	1
803	#else
804	0
805	#endif
806	},
807	{
808	#include "asciitab.h"
809	#include "latin1tab.h"
810	},
811	STANDARD_VTABLE(big2_)
812	};
813
814	#endif
815
816	static const struct normal_encoding big2_encoding = {
817	{ VTABLE, 2, 0,
818	#if XML_BYTE_ORDER == 21
819	1
820	#else
821	0
822	#endif
823	},
824	{
825	#define BT_COLON BT_NMSTRT
826	#include "asciitab.h"
827	#undef BT_COLON
828	#include "latin1tab.h"
829	},
830	STANDARD_VTABLE(big2_)
831	};
832
833	#if XML_BYTE_ORDER != 12
834
835	#ifdef XML_NS
836
837	static const struct normal_encoding internal_big2_encoding_ns = {
838	{ VTABLE, 2, 0, 1 },
839	{
840	#include "iasciitab.h"
841	#include "latin1tab.h"
842	},
843	STANDARD_VTABLE(big2_)
844	};
845
846	#endif
847
848	static const struct normal_encoding internal_big2_encoding = {
849	{ VTABLE, 2, 0, 1 },
850	{
851	#define BT_COLON BT_NMSTRT
852	#include "iasciitab.h"
853	#undef BT_COLON
854	#include "latin1tab.h"
855	},
856	STANDARD_VTABLE(big2_)
857	};
858
859	#endif
860
861	#undef PREFIX
862
/* Compare two NUL-terminated strings for equality, ignoring the case of the
   ASCII letters a-z.  Returns 1 if they are equal, 0 otherwise. */
static
int streqci(const char *s1, const char *s2)
{
  for (;;) {
    char c1 = *s1++;
    char c2 = *s2++;
    /* Fold lower case to upper case by hand; no locale is involved. */
    if ('a' <= c1 && c1 <= 'z')
      c1 += 'A' - 'a';
    if ('a' <= c2 && c2 <= 'z')
      c2 += 'A' - 'a';
    if (c1 != c2)
      return 0;
    if (!c1)
      break;
  }
  return 1;
}
880
881	static
882	void initUpdatePosition(const ENCODING enc, const char ptr,
883	const char end, POSITION pos)
884	{
885	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
886	}
887
888	static
889	int toAscii(const ENCODING enc, const char ptr, const char *end)
890	{
891	char buf[1];
892	char *p = buf;
893	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
894	if (p == buf)
895	return -1;
896	else
897	return buf[0];
898	}
899
/* Returns 1 if c is one of space (0x20), carriage return (0xD), line feed
   (0xA) or tab (0x9), and 0 for anything else, including the -1 that
   toAscii() returns on failure. */
static
int isSpace(int c)
{
  switch (c) {
  case 0x20:
  case 0xD:
  case 0xA:
  case 0x9:
    return 1;
  }
  return 0;
}
912
913	/* Return 1 if there's just optional white space
914	or there's an S followed by name=val. */
915	static
916	int parsePseudoAttribute(const ENCODING *enc,
917	const char *ptr,
918	const char *end,
919	const char **namePtr,
920	const char **valPtr,
921	const char **nextTokPtr)
922	{
923	int c;
924	char open;
925	if (ptr == end) {
926	*namePtr = 0;
927	return 1;
928	}
929	if (!isSpace(toAscii(enc, ptr, end))) {
930	*nextTokPtr = ptr;
931	return 0;
932	}
933	do {
934	ptr += enc->minBytesPerChar;
935	} while (isSpace(toAscii(enc, ptr, end)));
936	if (ptr == end) {
937	*namePtr = 0;
938	return 1;
939	}
940	*namePtr = ptr;
941	for (;;) {
942	c = toAscii(enc, ptr, end);
943	if (c == -1) {
944	*nextTokPtr = ptr;
945	return 0;
946	}
947	if (c == '=')
948	break;
949	if (isSpace(c)) {
950	do {
951	ptr += enc->minBytesPerChar;
952	} while (isSpace(c = toAscii(enc, ptr, end)));
953	if (c != '=') {
954	*nextTokPtr = ptr;
955	return 0;
956	}
957	break;
958	}
959	ptr += enc->minBytesPerChar;
960	}
961	if (ptr == *namePtr) {
962	*nextTokPtr = ptr;
963	return 0;
964	}
965	ptr += enc->minBytesPerChar;
966	c = toAscii(enc, ptr, end);
967	while (isSpace(c)) {
968	ptr += enc->minBytesPerChar;
969	c = toAscii(enc, ptr, end);
970	}
971	if (c != '"' && c != '\'') {
972	*nextTokPtr = ptr;
973	return 0;
974	}
975	open = c;
976	ptr += enc->minBytesPerChar;
977	*valPtr = ptr;
978	for (;; ptr += enc->minBytesPerChar) {
979	c = toAscii(enc, ptr, end);
980	if (c == open)
981	break;
982	if (!('a' <= c && c <= 'z')
983	&& !('A' <= c && c <= 'Z')
984	&& !('0' <= c && c <= '9')
985	&& c != '.'
986	&& c != '-'
987	&& c != '_') {
988	*nextTokPtr = ptr;
989	return 0;
990	}
991	}
992	*nextTokPtr = ptr + enc->minBytesPerChar;
993	return 1;
994	}
995
996	static
997	int doParseXmlDecl(const ENCODING (encodingFinder)(const ENCODING *,
998	const char *,
999	const char *),
1000	int isGeneralTextEntity,
1001	const ENCODING *enc,
1002	const char *ptr,
1003	const char *end,
1004	const char **badPtr,
1005	const char **versionPtr,
1006	const char **encodingName,
1007	const ENCODING **encoding,
1008	int *standalone)
1009	{
1010	const char *val = 0;
1011	const char *name = 0;
1012	ptr += 5 * enc->minBytesPerChar;
1013	end -= 2 * enc->minBytesPerChar;
1014	if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) \|\| !name) {
1015	*badPtr = ptr;
1016	return 0;
1017	}
1018	if (!XmlNameMatchesAscii(enc, name, "version")) {
1019	if (!isGeneralTextEntity) {
1020	*badPtr = name;
1021	return 0;
1022	}
1023	}
1024	else {
1025	if (versionPtr)
1026	*versionPtr = val;
1027	if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
1028	*badPtr = ptr;
1029	return 0;
1030	}
1031	if (!name) {
1032	if (isGeneralTextEntity) {
1033	/* a TextDecl must have an EncodingDecl */
1034	*badPtr = ptr;
1035	return 0;
1036	}
1037	return 1;
1038	}
1039	}
1040	if (XmlNameMatchesAscii(enc, name, "encoding")) {
1041	int c = toAscii(enc, val, end);
1042	if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
1043	*badPtr = val;
1044	return 0;
1045	}
1046	if (encodingName)
1047	*encodingName = val;
1048	if (encoding)
1049	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1050	if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
1051	*badPtr = ptr;
1052	return 0;
1053	}
1054	if (!name)
1055	return 1;
1056	}
1057	if (!XmlNameMatchesAscii(enc, name, "standalone") \|\| isGeneralTextEntity) {
1058	*badPtr = name;
1059	return 0;
1060	}
1061	if (XmlNameMatchesAscii(enc, val, "yes")) {
1062	if (standalone)
1063	*standalone = 1;
1064	}
1065	else if (XmlNameMatchesAscii(enc, val, "no")) {
1066	if (standalone)
1067	*standalone = 0;
1068	}
1069	else {
1070	*badPtr = val;
1071	return 0;
1072	}
1073	while (isSpace(toAscii(enc, ptr, end)))
1074	ptr += enc->minBytesPerChar;
1075	if (ptr != end) {
1076	*badPtr = ptr;
1077	return 0;
1078	}
1079	return 1;
1080	}
1081
1082	static
1083	int checkCharRefNumber(int result)
1084	{
1085	switch (result >> 8) {
1086	case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1087	case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1088	return -1;
1089	case 0:
1090	if (latin1_encoding.type[result] == BT_NONXML)
1091	return -1;
1092	break;
1093	case 0xFF:
1094	if (result == 0xFFFE \|\| result == 0xFFFF)
1095	return -1;
1096	break;
1097	}
1098	return result;
1099	}
1100
1101	int XmlUtf8Encode(int c, char *buf)
1102	{
1103	enum {
1104	/* minN is minimum legal resulting value for N byte sequence */
1105	min2 = 0x80,
1106	min3 = 0x800,
1107	min4 = 0x10000
1108	};
1109
1110	if (c < 0)
1111	return 0;
1112	if (c < min2) {
1113	buf[0] = (c \| UTF8_cval1);
1114	return 1;
1115	}
1116	if (c < min3) {
1117	buf[0] = ((c >> 6) \| UTF8_cval2);
1118	buf[1] = ((c & 0x3f) \| 0x80);
1119	return 2;
1120	}
1121	if (c < min4) {
1122	buf[0] = ((c >> 12) \| UTF8_cval3);
1123	buf[1] = (((c >> 6) & 0x3f) \| 0x80);
1124	buf[2] = ((c & 0x3f) \| 0x80);
1125	return 3;
1126	}
1127	if (c < 0x110000) {
1128	buf[0] = ((c >> 18) \| UTF8_cval4);
1129	buf[1] = (((c >> 12) & 0x3f) \| 0x80);
1130	buf[2] = (((c >> 6) & 0x3f) \| 0x80);
1131	buf[3] = ((c & 0x3f) \| 0x80);
1132	return 4;
1133	}
1134	return 0;
1135	}
1136
/* Encode the character number charNum as UTF-16 into buf, which must have
   room for 2 code units.  Returns 1 for values below 0x10000 (stored as-is,
   with no check for the surrogate range), 2 for values below 0x110000
   (stored as a surrogate pair), and 0 if charNum is negative or too large. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }
  if (charNum < 0x110000) {
    charNum -= 0x10000;
    buf[0] = (charNum >> 10) + 0xD800;
    buf[1] = (charNum & 0x3FF) + 0xDC00;
    return 2;
  }
  return 0;
}
1153
1154	struct unknown_encoding {
1155	struct normal_encoding normal;
1156	int (convert)(void userData, const char *p);
1157	void *userData;
1158	unsigned short utf16[256];
1159	char utf8[256][4];
1160	};
1161
1162	int XmlSizeOfUnknownEncoding()
1163	{
1164	return sizeof(struct unknown_encoding);
1165	}
1166
1167	static
1168	int unknown_isName(const ENCODING enc, const char p)
1169	{
1170	int c = ((const struct unknown_encoding *)enc)
1171	->convert(((const struct unknown_encoding *)enc)->userData, p);
1172	if (c & ~0xFFFF)
1173	return 0;
1174	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1175	}
1176
1177	static
1178	int unknown_isNmstrt(const ENCODING enc, const char p)
1179	{
1180	int c = ((const struct unknown_encoding *)enc)
1181	->convert(((const struct unknown_encoding *)enc)->userData, p);
1182	if (c & ~0xFFFF)
1183	return 0;
1184	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1185	}
1186
1187	static
1188	int unknown_isInvalid(const ENCODING enc, const char p)
1189	{
1190	int c = ((const struct unknown_encoding *)enc)
1191	->convert(((const struct unknown_encoding *)enc)->userData, p);
1192	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
1193	}
1194
1195	static
1196	void unknown_toUtf8(const ENCODING *enc,
1197	const char *fromP, const char fromLim,
1198	char *toP, const char toLim)
1199	{
1200	char buf[XML_UTF8_ENCODE_MAX];
1201	for (;;) {
1202	const char *utf8;
1203	int n;
1204	if (*fromP == fromLim)
1205	break;
1206	utf8 = ((const struct unknown_encoding )enc)->utf8[(unsigned char)*fromP];
1207	n = *utf8++;
1208	if (n == 0) {
1209	int c = ((const struct unknown_encoding *)enc)
1210	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1211	n = XmlUtf8Encode(c, buf);
1212	if (n > toLim - *toP)
1213	break;
1214	utf8 = buf;
1215	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1216	- (BT_LEAD2 - 2);
1217	}
1218	else {
1219	if (n > toLim - *toP)
1220	break;
1221	(*fromP)++;
1222	}
1223	do {
1224	(toP)++ = *utf8++;
1225	} while (--n != 0);
1226	}
1227	}
1228
1229	static
1230	void unknown_toUtf16(const ENCODING *enc,
1231	const char *fromP, const char fromLim,
1232	unsigned short *toP, const unsigned short toLim)
1233	{
1234	while (fromP != fromLim && toP != toLim) {
1235	unsigned short c
1236	= ((const struct unknown_encoding )enc)->utf16[(unsigned char)*fromP];
1237	if (c == 0) {
1238	c = (unsigned short)((const struct unknown_encoding *)enc)
1239	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1240	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1241	- (BT_LEAD2 - 2);
1242	}
1243	else
1244	(*fromP)++;
1245	(toP)++ = c;
1246	}
1247	}
1248
1249	ENCODING *
1250	XmlInitUnknownEncoding(void *mem,
1251	int *table,
1252	int (convert)(void userData, const char *p),
1253	void *userData)
1254	{
1255	int i;
1256	struct unknown_encoding *e = mem;
1257	for (i = 0; i < sizeof(struct normal_encoding); i++)
1258	((char )mem)[i] = ((char )&latin1_encoding)[i];
1259	for (i = 0; i < 128; i++)
1260	if (latin1_encoding.type[i] != BT_OTHER
1261	&& latin1_encoding.type[i] != BT_NONXML
1262	&& table[i] != i)
1263	return 0;
1264	for (i = 0; i < 256; i++) {
1265	int c = table[i];
1266	if (c == -1) {
1267	e->normal.type[i] = BT_MALFORM;
1268	/* This shouldn't really get used. */
1269	e->utf16[i] = 0xFFFF;
1270	e->utf8[i][0] = 1;
1271	e->utf8[i][1] = 0;
1272	}
1273	else if (c < 0) {
1274	if (c < -4)
1275	return 0;
1276	e->normal.type[i] = BT_LEAD2 - (c + 2);
1277	e->utf8[i][0] = 0;
1278	e->utf16[i] = 0;
1279	}
1280	else if (c < 0x80) {
1281	if (latin1_encoding.type[c] != BT_OTHER
1282	&& latin1_encoding.type[c] != BT_NONXML
1283	&& c != i)
1284	return 0;
1285	e->normal.type[i] = latin1_encoding.type[c];
1286	e->utf8[i][0] = 1;
1287	e->utf8[i][1] = (char)c;
1288	e->utf16[i] = c == 0 ? 0xFFFF : c;
1289	}
1290	else if (checkCharRefNumber(c) < 0) {
1291	e->normal.type[i] = BT_NONXML;
1292	/* This shouldn't really get used. */
1293	e->utf16[i] = 0xFFFF;
1294	e->utf8[i][0] = 1;
1295	e->utf8[i][1] = 0;
1296	}
1297	else {
1298	if (c > 0xFFFF)
1299	return 0;
1300	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1301	e->normal.type[i] = BT_NMSTRT;
1302	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1303	e->normal.type[i] = BT_NAME;
1304	else
1305	e->normal.type[i] = BT_OTHER;
1306	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1307	e->utf16[i] = c;
1308	}
1309	}
1310	e->userData = userData;
1311	e->convert = convert;
1312	if (convert) {
1313	e->normal.isName2 = unknown_isName;
1314	e->normal.isName3 = unknown_isName;
1315	e->normal.isName4 = unknown_isName;
1316	e->normal.isNmstrt2 = unknown_isNmstrt;
1317	e->normal.isNmstrt3 = unknown_isNmstrt;
1318	e->normal.isNmstrt4 = unknown_isNmstrt;
1319	e->normal.isInvalid2 = unknown_isInvalid;
1320	e->normal.isInvalid3 = unknown_isInvalid;
1321	e->normal.isInvalid4 = unknown_isInvalid;
1322	}
1323	e->normal.enc.utf8Convert = unknown_toUtf8;
1324	e->normal.enc.utf16Convert = unknown_toUtf16;
1325	return &(e->normal.enc);
1326	}
1327
/* Indices of the built-in encodings.  getEncodingIndex and the encodings
   table depend on these exact values and on their order, so any change
   here must be mirrored there. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
1341
1342	static
1343	int getEncodingIndex(const char *name)
1344	{
1345	static const char *encodingNames[] = {
1346	"ISO-8859-1",
1347	"US-ASCII",
1348	"UTF-8",
1349	"UTF-16",
1350	"UTF-16BE"
1351	"UTF-16LE",
1352	};
1353	int i;
1354	if (name == 0)
1355	return NO_ENC;
1356	for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
1357	if (streqci(name, encodingNames[i]))
1358	return i;
1359	return UNKNOWN_ENC;
1360	}
1361
/* The index of the encoding requested at initialization time is kept in
   the isUtf16 member of the embedded initEnc structure; reusing that
   member keeps the structure layout binary compatible. */

#define INIT_ENC_INDEX(p) ((p)->initEnc.isUtf16)
1366
1367	/* This is what detects the encoding.
1368	encodingTable maps from encoding indices to encodings;
1369	INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1370	state is XML_CONTENT_STATE if we're parsing an external text entity,
1371	and XML_PROLOG_STATE otherwise.
1372	*/
1373
1374
1375	static
1376	int initScan(const ENCODING **encodingTable,
1377	const INIT_ENCODING *enc,
1378	int state,
1379	const char *ptr,
1380	const char *end,
1381	const char **nextTokPtr)
1382	{
1383	const ENCODING **encPtr;
1384
1385	if (ptr == end)
1386	return XML_TOK_NONE;
1387	encPtr = enc->encPtr;
1388	if (ptr + 1 == end) {
1389	/* only a single byte available for auto-detection */
1390	/* a well-formed document entity must have more than one byte */
1391	if (state != XML_CONTENT_STATE)
1392	return XML_TOK_PARTIAL;
1393	/* so we're parsing an external text entity... */
1394	/* if UTF-16 was externally specified, then we need at least 2 bytes */
1395	switch (INIT_ENC_INDEX(enc)) {
1396	case UTF_16_ENC:
1397	case UTF_16LE_ENC:
1398	case UTF_16BE_ENC:
1399	return XML_TOK_PARTIAL;
1400	}
1401	switch ((unsigned char)*ptr) {
1402	case 0xFE:
1403	case 0xFF:
1404	case 0xEF: /* possibly first byte of UTF-8 BOM */
1405	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1406	&& state == XML_CONTENT_STATE)
1407	break;
1408	/* fall through */
1409	case 0x00:
1410	case 0x3C:
1411	return XML_TOK_PARTIAL;
1412	}
1413	}
1414	else {
1415	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1]) {
1416	case 0xFEFF:
1417	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1418	&& state == XML_CONTENT_STATE)
1419	break;
1420	*nextTokPtr = ptr + 2;
1421	*encPtr = encodingTable[UTF_16BE_ENC];
1422	return XML_TOK_BOM;
1423	/* 00 3C is handled in the default case */
1424	case 0x3C00:
1425	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1426	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1427	&& state == XML_CONTENT_STATE)
1428	break;
1429	*encPtr = encodingTable[UTF_16LE_ENC];
1430	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1431	case 0xFFFE:
1432	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1433	&& state == XML_CONTENT_STATE)
1434	break;
1435	*nextTokPtr = ptr + 2;
1436	*encPtr = encodingTable[UTF_16LE_ENC];
1437	return XML_TOK_BOM;
1438	case 0xEFBB:
1439	/* Maybe a UTF-8 BOM (EF BB BF) */
1440	/* If there's an explicitly specified (external) encoding
1441	of ISO-8859-1 or some flavour of UTF-16
1442	and this is an external text entity,
1443	don't look for the BOM,
1444	because it might be a legal data. */
1445	if (state == XML_CONTENT_STATE) {
1446	int e = INIT_ENC_INDEX(enc);
1447	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC \|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
1448	break;
1449	}
1450	if (ptr + 2 == end)
1451	return XML_TOK_PARTIAL;
1452	if ((unsigned char)ptr[2] == 0xBF) {
1453	*encPtr = encodingTable[UTF_8_ENC];
1454	return XML_TOK_BOM;
1455	}
1456	break;
1457	default:
1458	if (ptr[0] == '\0') {
1459	/* 0 isn't a legal data character. Furthermore a document entity can only
1460	start with ASCII characters. So the only way this can fail to be big-endian
1461	UTF-16 if it it's an external parsed general entity that's labelled as
1462	UTF-16LE. */
1463	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1464	break;
1465	*encPtr = encodingTable[UTF_16BE_ENC];
1466	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1467	}
1468	else if (ptr[1] == '\0') {
1469	/* We could recover here in the case:
1470	- parsing an external entity
1471	- second byte is 0
1472	- no externally specified encoding
1473	- no encoding declaration
1474	by assuming UTF-16LE. But we don't, because this would mean when
1475	presented just with a single byte, we couldn't reliably determine
1476	whether we needed further bytes. */
1477	if (state == XML_CONTENT_STATE)
1478	break;
1479	*encPtr = encodingTable[UTF_16LE_ENC];
1480	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1481	}
1482	break;
1483	}
1484	}
1485	*encPtr = encodingTable[(int)INIT_ENC_INDEX(enc)];
1486	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1487	}
1488
1489
1490	#define NS(x) x
1491	#define ns(x) x
1492	#include "xmltok_ns.c"
1493	#undef NS
1494	#undef ns
1495
1496	#ifdef XML_NS
1497
1498	#define NS(x) x ## NS
1499	#define ns(x) x ## _ns
1500
1501	#include "xmltok_ns.c"
1502
1503	#undef NS
1504	#undef ns
1505
1506	ENCODING *
1507	XmlInitUnknownEncodingNS(void *mem,
1508	int *table,
1509	int (convert)(void userData, const char *p),
1510	void *userData)
1511	{
1512	ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1513	if (enc)
1514	((struct normal_encoding *)enc)->type[':'] = BT_COLON;
1515	return enc;
1516	}
1517
1518	#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: