Context Navigation

source: protocols/jabber/xmltok.c @ 266fe2f

Last change on this file since 266fe2f was b7d3cc34, checked in by Wilmer van der Gaast <wilmer@…>, at 2005-11-06T18:23:18Z
Initial repository (0.99 release tree)
Property mode set to `100644`
File size: 40.0 KB

Rev	Line
[b7d3cc34]	1	/*
	2	The contents of this file are subject to the Mozilla Public License
	3	Version 1.1 (the "License"); you may not use this file except in
	4	compliance with the License. You may obtain a copy of the License at
	5	http://www.mozilla.org/MPL/
	6
	7	Software distributed under the License is distributed on an "AS IS"
	8	basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
	9	License for the specific language governing rights and limitations
	10	under the License.
	11
	12	The Original Code is expat.
	13
	14	The Initial Developer of the Original Code is James Clark.
	15	Portions created by James Clark are Copyright (C) 1998, 1999
	16	James Clark. All Rights Reserved.
	17
	18	Contributor(s):
	19
	20	*/
	21
	22	#include "xmldef.h"
	23	#include "xmltok.h"
	24	#include "nametab.h"
	25
	/* VTABLE1 is an initializer fragment: the PREFIX()-qualified tokenizer
	   entry points in a fixed order.  It expands with whichever PREFIX is
	   defined at the point of use.  VTABLE appends the two conversion
	   routines (toUtf8, toUtf16) under the same prefix.
	   NOTE(review): the order presumably mirrors the leading members of
	   ENCODING in xmltok.h -- confirm there before reordering. */
	26	#define VTABLE1 \
	27	{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
	28	{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
	29	PREFIX(sameName), \
	30	PREFIX(nameMatchesAscii), \
	31	PREFIX(nameLength), \
	32	PREFIX(skipS), \
	33	PREFIX(getAtts), \
	34	PREFIX(charRefNumber), \
	35	PREFIX(predefinedEntityName), \
	36	PREFIX(updatePosition), \
	37	PREFIX(isPublicId)
	38
	39	#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
	40
	/* Test one bit of namingBitmap for the 16-bit character (hi << 8) | lo.
	   pages[hi] selects a group of 8 bitmap words, the top 3 bits of lo pick
	   the word and the low 5 bits pick the bit.  The mask is built from 1u
	   because bit 31 can be selected and 1 << 31 overflows a signed int.
	   Result is non-zero when the bit is set. */
	#define UCS2_GET_NAMING(pages, hi, lo) \
	  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
	43
	/* A 2 byte UTF-8 representation splits the character's 11 bits
	   between the bottom 5 and 6 bits of the bytes.
	   We need 8 bits to index into pages, 3 bits to add to that index and
	   5 bits to generate the mask.
	   `byte` must address unsigned char data.  The mask uses 1u so that
	   selecting bit 31 does not overflow a signed int. */
	#define UTF8_GET_NAMING2(pages, byte) \
	  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
	                + ((((byte)[0]) & 3) << 1) \
	                + ((((byte)[1]) >> 5) & 1)] \
	   & (1u << (((byte)[1]) & 0x1F)))
	53
	/* A 3 byte UTF-8 representation splits the character's 16 bits
	   between the bottom 4, 6 and 6 bits of the bytes.
	   We need 8 bits to index into pages, 3 bits to add to that index and
	   5 bits to generate the mask.
	   `byte` must address unsigned char data.  The mask uses 1u so that
	   selecting bit 31 does not overflow a signed int. */
	#define UTF8_GET_NAMING3(pages, byte) \
	  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
	                         + ((((byte)[1]) >> 2) & 0xF)] \
	                 << 3) \
	                + ((((byte)[1]) & 3) << 1) \
	                + ((((byte)[2]) >> 5) & 1)] \
	   & (1u << (((byte)[2]) & 0x1F)))
	65
	/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked
	   up through UTF8_GET_NAMING2/3 (p is viewed as unsigned char); any
	   other length yields 0. */
	66	#define UTF8_GET_NAMING(pages, p, n) \
	67	((n) == 2 \
	68	? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
	69	: ((n) == 3 \
	70	? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
	71	: 0))
	72
	/* Non-zero when the 3 byte sequence at p (unsigned char data) must be
	   rejected: a lead byte 0xED whose second byte has bit 0x20 set (the
	   UTF-16 surrogate range U+D800..U+DFFF), or the sequences EF BF BE and
	   EF BF BF (U+FFFE and U+FFFF). */
	#define UTF8_INVALID3(p) \
	  ((*(p)) == 0xED \
	   ? (((p)[1] & 0x20) != 0) \
	   : ((*(p)) == 0xEF \
	      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
	      : 0))
	79
	/* Non-zero when the 4 byte sequence at p (unsigned char data) starts
	   with 0xF4 and its second byte has either of the 0x30 bits set, i.e.
	   the sequence would encode a value above U+10FFFF. */
	#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
	81
	82	static
	83	int isNever(const ENCODING enc, const char p)
	84	{
	85	return 0;
	86	}
	87
	88	static
	89	int utf8_isName2(const ENCODING enc, const char p)
	90	{
	91	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
	92	}
	93
	94	static
	95	int utf8_isName3(const ENCODING enc, const char p)
	96	{
	97	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
	98	}
	99
	100	#define utf8_isName4 isNever
	101
	102	static
	103	int utf8_isNmstrt2(const ENCODING enc, const char p)
	104	{
	105	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
	106	}
	107
	108	static
	109	int utf8_isNmstrt3(const ENCODING enc, const char p)
	110	{
	111	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
	112	}
	113
	114	#define utf8_isNmstrt4 isNever
	115
	116	#define utf8_isInvalid2 isNever
	117
	118	static
	119	int utf8_isInvalid3(const ENCODING enc, const char p)
	120	{
	121	return UTF8_INVALID3((const unsigned char *)p);
	122	}
	123
	124	static
	125	int utf8_isInvalid4(const ENCODING enc, const char p)
	126	{
	127	return UTF8_INVALID4((const unsigned char *)p);
	128	}
	129
	130	struct normal_encoding {
	131	ENCODING enc;
	132	unsigned char type[256];
	133	#ifdef XML_MIN_SIZE
	134	int (byteType)(const ENCODING , const char *);
	135	int (isNameMin)(const ENCODING , const char *);
	136	int (isNmstrtMin)(const ENCODING , const char *);
	137	int (byteToAscii)(const ENCODING , const char *);
	138	int (charMatches)(const ENCODING , const char *, int);
	139	#endif /* XML_MIN_SIZE */
	140	int (isName2)(const ENCODING , const char *);
	141	int (isName3)(const ENCODING , const char *);
	142	int (isName4)(const ENCODING , const char *);
	143	int (isNmstrt2)(const ENCODING , const char *);
	144	int (isNmstrt3)(const ENCODING , const char *);
	145	int (isNmstrt4)(const ENCODING , const char *);
	146	int (isInvalid2)(const ENCODING , const char *);
	147	int (isInvalid3)(const ENCODING , const char *);
	148	int (isInvalid4)(const ENCODING , const char *);
	149	};
	150
	/* STANDARD_VTABLE(E): positional initializers for the XML_MIN_SIZE-only
	   members of struct normal_encoding, using prefix E.  The expansion ends
	   with a comma so NORMAL_VTABLE can follow directly.  Without
	   XML_MIN_SIZE it expands to nothing. */
	151	#ifdef XML_MIN_SIZE
	152
	153	#define STANDARD_VTABLE(E) \
	154	E ## byteType, \
	155	E ## isNameMin, \
	156	E ## isNmstrtMin, \
	157	E ## byteToAscii, \
	158	E ## charMatches,
	159
	160	#else
	161
	162	#define STANDARD_VTABLE(E) /* as nothing */
	163
	164	#endif
	165
	/* NORMAL_VTABLE(E): positional initializers for the nine multi-byte
	   predicates of struct normal_encoding (isName2..isInvalid4), using
	   prefix E.  No trailing comma. */
	166	#define NORMAL_VTABLE(E) \
	167	E ## isName2, \
	168	E ## isName3, \
	169	E ## isName4, \
	170	E ## isNmstrt2, \
	171	E ## isNmstrt3, \
	172	E ## isNmstrt4, \
	173	E ## isInvalid2, \
	174	E ## isInvalid3, \
	175	E ## isInvalid4
	176
	177	static int checkCharRefNumber(int);
	178
	179	#include "xmltok_impl.h"
	180
	/* Configuration for the first ("normal_") instantiation of
	   xmltok_impl.c.  With XML_MIN_SIZE the minimum-size name checks of the
	   sb_ prefix are aliased to isNever and MINBPC is read from the encoding
	   object; otherwise MINBPC is the constant 1. */
	181	#ifdef XML_MIN_SIZE
	182	#define sb_isNameMin isNever
	183	#define sb_isNmstrtMin isNever
	184	#endif
	185
	186	#ifdef XML_MIN_SIZE
	187	#define MINBPC(enc) ((enc)->minBytesPerChar)
	188	#else
	189	/* minimum bytes per character */
	190	#define MINBPC(enc) 1
	191	#endif
	192
	/* Byte type (BT_*) of the single byte at p, read from the encoding's
	   type table. */
	#define SB_BYTE_TYPE(enc, p) \
	  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

	/* With XML_MIN_SIZE the lookup goes through the byteType function
	   pointer (sb_byteType for single-byte encodings); otherwise BYTE_TYPE
	   is the direct table lookup. */
	#ifdef XML_MIN_SIZE
	static
	int sb_byteType(const ENCODING *enc, const char *p)
	{
	  return SB_BYTE_TYPE(enc, p);
	}
	#define BYTE_TYPE(enc, p) \
	 (((const struct normal_encoding *)(enc))->byteType(enc, p))
	#else
	#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
	#endif
	207
	/* BYTE_TO_ASCII yields the character at p as an int.  With XML_MIN_SIZE
	   it dispatches through the byteToAscii function pointer
	   (sb_byteToAscii for single-byte encodings); otherwise it is a plain
	   dereference. */
	#ifdef XML_MIN_SIZE
	#define BYTE_TO_ASCII(enc, p) \
	 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
	static
	int sb_byteToAscii(const ENCODING *enc, const char *p)
	{
	  return *p;
	}
	#else
	#define BYTE_TO_ASCII(enc, p) (*(p))
	#endif
	219
	/* Character-class hooks consumed by xmltok_impl.c.  IS_NAME_CHAR,
	   IS_NMSTRT_CHAR and IS_INVALID_CHAR paste the sequence length n onto
	   the member name and call that function pointer of the
	   normal_encoding.  The *_MINBPC variants call the isNameMin/isNmstrtMin
	   pointers under XML_MIN_SIZE and are the constant 0 otherwise. */
	220	#define IS_NAME_CHAR(enc, p, n) \
	221	(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
	222	#define IS_NMSTRT_CHAR(enc, p, n) \
	223	(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
	224	#define IS_INVALID_CHAR(enc, p, n) \
	225	(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
	226
	227	#ifdef XML_MIN_SIZE
	228	#define IS_NAME_CHAR_MINBPC(enc, p) \
	229	(((const struct normal_encoding *)(enc))->isNameMin(enc, p))
	230	#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
	231	(((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
	232	#else
	233	#define IS_NAME_CHAR_MINBPC(enc, p) (0)
	234	#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
	235	#endif
	236
	/* CHAR_MATCHES tests whether the character at p equals the ASCII
	   character c.  With XML_MIN_SIZE it dispatches through the charMatches
	   function pointer (sb_charMatches for single-byte encodings);
	   otherwise it compares the byte directly. */
	#ifdef XML_MIN_SIZE
	#define CHAR_MATCHES(enc, p, c) \
	 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
	static
	int sb_charMatches(const ENCODING *enc, const char *p, int c)
	{
	  return *p == c;
	}
	#else
	/* c is an ASCII character */
	#define CHAR_MATCHES(enc, p, c) (*(p) == c)
	#endif
	249
	/* Instantiate xmltok_impl.c with the normal_ prefix using the hook
	   macros defined above, then remove those hooks so a later
	   instantiation can define its own. */
	250	#define PREFIX(ident) normal_ ## ident
	251	#include "xmltok_impl.c"
	252
	253	#undef MINBPC
	254	#undef BYTE_TYPE
	255	#undef BYTE_TO_ASCII
	256	#undef CHAR_MATCHES
	257	#undef IS_NAME_CHAR
	258	#undef IS_NAME_CHAR_MINBPC
	259	#undef IS_NMSTRT_CHAR
	260	#undef IS_NMSTRT_CHAR_MINBPC
	261	#undef IS_INVALID_CHAR
	262
	/* Lead-byte marker bits for UTF-8 sequences of 1 to 4 bytes; these are
	   OR-ed into the first output byte by the UTF-8 encoders in this file. */
	263	enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
	264	UTF8_cval1 = 0x00,
	265	UTF8_cval2 = 0xc0,
	266	UTF8_cval3 = 0xe0,
	267	UTF8_cval4 = 0xf0
	268	};
	269
	270	static
	271	void utf8_toUtf8(const ENCODING *enc,
	272	const char *fromP, const char fromLim,
	273	char *toP, const char toLim)
	274	{
	275	char *to;
	276	const char *from;
	277	if (fromLim - fromP > toLim - toP) {
	278	/* Avoid copying partial characters. */
	279	for (fromLim = fromP + (toLim - toP); fromLim > *fromP; fromLim--)
	280	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
	281	break;
	282	}
	283	for (to = toP, from = fromP; from != fromLim; from++, to++)
	284	to = from;
	285	*fromP = from;
	286	*toP = to;
	287	}
	288
	289	static
	290	void utf8_toUtf16(const ENCODING *enc,
	291	const char *fromP, const char fromLim,
	292	unsigned short *toP, const unsigned short toLim)
	293	{
	294	unsigned short to = toP;
	295	const char from = fromP;
	296	while (from != fromLim && to != toLim) {
	297	switch (((struct normal_encoding )enc)->type[(unsigned char)from]) {
	298	case BT_LEAD2:
	299	*to++ = ((from[0] & 0x1f) << 6) \| (from[1] & 0x3f);
	300	from += 2;
	301	break;
	302	case BT_LEAD3:
	303	*to++ = ((from[0] & 0xf) << 12) \| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f);
	304	from += 3;
	305	break;
	306	case BT_LEAD4:
	307	{
	308	unsigned long n;
	309	if (to + 1 == toLim)
	310	break;
	311	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12) \| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
	312	n -= 0x10000;
	313	to[0] = (unsigned short)((n >> 10) \| 0xD800);
	314	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
	315	to += 2;
	316	from += 4;
	317	}
	318	break;
	319	default:
	320	to++ = from++;
	321	break;
	322	}
	323	}
	324	*fromP = from;
	325	*toP = to;
	326	}
	327
	/* UTF-8 encoding object built only with XML_NS.  The type table is
	   asciitab.h followed by utf8tab.h with BT_COLON left as those headers
	   use it (utf8_encoding below remaps it to BT_NMSTRT).
	   NOTE(review): the trailing "1, 1, 0" presumably fills the remaining
	   ENCODING members declared in xmltok.h -- confirm there. */
	328	#ifdef XML_NS
	329	static const struct normal_encoding utf8_encoding_ns = {
	330	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	331	{
	332	#include "asciitab.h"
	333	#include "utf8tab.h"
	334	},
	335	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	336	};
	337	#endif
	338
	/* UTF-8 encoding object.  While asciitab.h is included BT_COLON is
	   defined as BT_NMSTRT, so entries written as BT_COLON in that header
	   become BT_NMSTRT in this table. */
	339	static const struct normal_encoding utf8_encoding = {
	340	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	341	{
	342	#define BT_COLON BT_NMSTRT
	343	#include "asciitab.h"
	344	#undef BT_COLON
	345	#include "utf8tab.h"
	346	},
	347	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	348	};
	349
	/* XML_NS-only UTF-8 encoding object whose ASCII part of the type table
	   comes from iasciitab.h instead of asciitab.h.
	   NOTE(review): the "internal" tables are presumably for text already
	   normalised by the parser -- confirm against iasciitab.h. */
	350	#ifdef XML_NS
	351
	352	static const struct normal_encoding internal_utf8_encoding_ns = {
	353	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	354	{
	355	#include "iasciitab.h"
	356	#include "utf8tab.h"
	357	},
	358	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	359	};
	360
	361	#endif
	362
	/* UTF-8 encoding object built from iasciitab.h, with BT_COLON defined as
	   BT_NMSTRT while that header is included. */
	363	static const struct normal_encoding internal_utf8_encoding = {
	364	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	365	{
	366	#define BT_COLON BT_NMSTRT
	367	#include "iasciitab.h"
	368	#undef BT_COLON
	369	#include "utf8tab.h"
	370	},
	371	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	372	};
	373
	374	static
	375	void latin1_toUtf8(const ENCODING *enc,
	376	const char *fromP, const char fromLim,
	377	char *toP, const char toLim)
	378	{
	379	for (;;) {
	380	unsigned char c;
	381	if (*fromP == fromLim)
	382	break;
	383	c = (unsigned char)**fromP;
	384	if (c & 0x80) {
	385	if (toLim - *toP < 2)
	386	break;
	387	(toP)++ = ((c >> 6) \| UTF8_cval2);
	388	(toP)++ = ((c & 0x3f) \| 0x80);
	389	(*fromP)++;
	390	}
	391	else {
	392	if (*toP == toLim)
	393	break;
	394	(toP)++ = (fromP)++;
	395	}
	396	}
	397	}
	398
	399	static
	400	void latin1_toUtf16(const ENCODING *enc,
	401	const char *fromP, const char fromLim,
	402	unsigned short *toP, const unsigned short toLim)
	403	{
	404	while (fromP != fromLim && toP != toLim)
	405	(toP)++ = (unsigned char)(fromP)++;
	406	}
	407
	/* XML_NS-only Latin-1 encoding object; type table is asciitab.h followed
	   by latin1tab.h.  Only STANDARD_VTABLE is supplied, so the multi-byte
	   predicate pointers are left zero-initialised. */
	408	#ifdef XML_NS
	409
	410	static const struct normal_encoding latin1_encoding_ns = {
	411	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
	412	{
	413	#include "asciitab.h"
	414	#include "latin1tab.h"
	415	},
	416	STANDARD_VTABLE(sb_)
	417	};
	418
	419	#endif
	420
	/* Latin-1 encoding object with BT_COLON defined as BT_NMSTRT while
	   asciitab.h is included.  Only STANDARD_VTABLE is supplied, so the
	   multi-byte predicate pointers are left zero-initialised.  Its type
	   table is also consulted by checkCharRefNumber and used as the template
	   in XmlInitUnknownEncoding. */
	421	static const struct normal_encoding latin1_encoding = {
	422	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
	423	{
	424	#define BT_COLON BT_NMSTRT
	425	#include "asciitab.h"
	426	#undef BT_COLON
	427	#include "latin1tab.h"
	428	},
	429	STANDARD_VTABLE(sb_)
	430	};
	431
	432	static
	433	void ascii_toUtf8(const ENCODING *enc,
	434	const char *fromP, const char fromLim,
	435	char *toP, const char toLim)
	436	{
	437	while (fromP != fromLim && toP != toLim)
	438	(toP)++ = (fromP)++;
	439	}
	440
	/* XML_NS-only ASCII encoding object.  Only asciitab.h initialises the
	   type table; the remaining entries are implicitly zero, which the
	   inline comment identifies as BT_NONXML.  UTF-16 conversion reuses
	   latin1_toUtf16. */
	441	#ifdef XML_NS
	442
	443	static const struct normal_encoding ascii_encoding_ns = {
	444	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
	445	{
	446	#include "asciitab.h"
	447	/* BT_NONXML == 0 */
	448	},
	449	STANDARD_VTABLE(sb_)
	450	};
	451
	452	#endif
	453
	/* ASCII encoding object with BT_COLON defined as BT_NMSTRT while
	   asciitab.h is included.  Entries not covered by asciitab.h are
	   implicitly zero (BT_NONXML per the inline comment).  UTF-16
	   conversion reuses latin1_toUtf16. */
	454	static const struct normal_encoding ascii_encoding = {
	455	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
	456	{
	457	#define BT_COLON BT_NMSTRT
	458	#include "asciitab.h"
	459	#undef BT_COLON
	460	/* BT_NONXML == 0 */
	461	},
	462	STANDARD_VTABLE(sb_)
	463	};
	464
	465	static int unicode_byte_type(char hi, char lo)
	466	{
	467	switch ((unsigned char)hi) {
	468	case 0xD8: case 0xD9: case 0xDA: case 0xDB:
	469	return BT_LEAD4;
	470	case 0xDC: case 0xDD: case 0xDE: case 0xDF:
	471	return BT_TRAIL;
	472	case 0xFF:
	473	switch ((unsigned char)lo) {
	474	case 0xFF:
	475	case 0xFE:
	476	return BT_NONXML;
	477	}
	478	break;
	479	}
	480	return BT_NONASCII;
	481	}
	482
	/* DEFINE_UTF16_TO_UTF8(E) defines E##toUtf8, converting 2-byte UTF-16
	   units from [*fromP, fromLim) to UTF-8 in [*toP, toLim).  Byte order
	   comes from the GET_LO/GET_HI macros in effect at expansion time.
	   When the next character does not fit, *fromP is set to that character
	   and the function returns; *toP is advanced in place. */
	#define DEFINE_UTF16_TO_UTF8(E) \
	static \
	void E ## toUtf8(const ENCODING *enc, \
	                 const char **fromP, const char *fromLim, \
	                 char **toP, const char *toLim) \
	{ \
	  const char *from; \
	  for (from = *fromP; from != fromLim; from += 2) { \
	    int plane; \
	    unsigned char lo2; \
	    unsigned char lo = GET_LO(from); \
	    unsigned char hi = GET_HI(from); \
	    switch (hi) { \
	    case 0: \
	      if (lo < 0x80) { \
	        if (*toP == toLim) { \
	          *fromP = from; \
	          return; \
	        } \
	        *(*toP)++ = lo; \
	        break; \
	      } \
	      /* fall through */ \
	    case 0x1: case 0x2: case 0x3: \
	    case 0x4: case 0x5: case 0x6: case 0x7: \
	      if (toLim - *toP < 2) { \
	        *fromP = from; \
	        return; \
	      } \
	      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
	      *(*toP)++ = ((lo & 0x3f) | 0x80); \
	      break; \
	    default: \
	      if (toLim - *toP < 3) { \
	        *fromP = from; \
	        return; \
	      } \
	      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
	      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
	      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
	      *(*toP)++ = ((lo & 0x3f) | 0x80); \
	      break; \
	    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
	      /* high surrogate: consumes the following unit as well */ \
	      if (toLim - *toP < 4) { \
	        *fromP = from; \
	        return; \
	      } \
	      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
	      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
	      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
	      from += 2; \
	      lo2 = GET_LO(from); \
	      *(*toP)++ = (((lo & 0x3) << 4) \
	                   | ((GET_HI(from) & 0x3) << 2) \
	                   | (lo2 >> 6) \
	                   | 0x80); \
	      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
	      break; \
	    } \
	  } \
	  *fromP = from; \
	}
	545
	/* DEFINE_UTF16_TO_UTF16(E) defines E##toUtf16, repacking 2-byte units
	   from [*fromP, fromLim) into unsigned short code units in
	   [*toP, toLim) using the GET_LO/GET_HI macros in effect at expansion
	   time.  *fromP and *toP are advanced in place. */
	#define DEFINE_UTF16_TO_UTF16(E) \
	static \
	void E ## toUtf16(const ENCODING *enc, \
	                  const char **fromP, const char *fromLim, \
	                  unsigned short **toP, const unsigned short *toLim) \
	{ \
	  /* Avoid copying first half only of surrogate */ \
	  if (fromLim - *fromP > ((toLim - *toP) << 1) \
	      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
	    fromLim -= 2; \
	  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
	    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
	}
	559
	/* Instantiate the UTF-16 converters twice.  First with little-endian
	   accessors (byte 0 is the low byte, byte 1 the high byte) to define
	   little2_toUtf8/little2_toUtf16, then with the accessors swapped to
	   define big2_toUtf8/big2_toUtf16.  The accessors are undefined after
	   each pair. */
	560	#define SET2(ptr, ch) \
	561	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
	562	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
	563	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
	564
	565	DEFINE_UTF16_TO_UTF8(little2_)
	566	DEFINE_UTF16_TO_UTF16(little2_)
	567
	568	#undef SET2
	569	#undef GET_LO
	570	#undef GET_HI
	571
	572	#define SET2(ptr, ch) \
	573	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
	574	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
	575	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
	576
	577	DEFINE_UTF16_TO_UTF8(big2_)
	578	DEFINE_UTF16_TO_UTF16(big2_)
	579
	580	#undef SET2
	581	#undef GET_LO
	582	#undef GET_HI
	583
	/* Little-endian 2-byte helpers: (p)[0] is the low byte, (p)[1] the high
	   byte.  When the high byte is 0 the byte type comes from the encoding's
	   type table indexed by the low byte; otherwise from
	   unicode_byte_type.  BYTE_TO_ASCII yields -1 for characters whose high
	   byte is not 0. */
	#define LITTLE2_BYTE_TYPE(enc, p) \
	 ((p)[1] == 0 \
	  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[0]] \
	  : unicode_byte_type((p)[1], (p)[0]))
	#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
	#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
	#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
	  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
	#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
	  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
	594
	595	#ifdef XML_MIN_SIZE
	596
	597	static
	598	int little2_byteType(const ENCODING enc, const char p)
	599	{
	600	return LITTLE2_BYTE_TYPE(enc, p);
	601	}
	602
	603	static
	604	int little2_byteToAscii(const ENCODING enc, const char p)
	605	{
	606	return LITTLE2_BYTE_TO_ASCII(enc, p);
	607	}
	608
	609	static
	610	int little2_charMatches(const ENCODING enc, const char p, int c)
	611	{
	612	return LITTLE2_CHAR_MATCHES(enc, p, c);
	613	}
	614
	615	static
	616	int little2_isNameMin(const ENCODING enc, const char p)
	617	{
	618	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
	619	}
	620
	621	static
	622	int little2_isNmstrtMin(const ENCODING enc, const char p)
	623	{
	624	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
	625	}
	626
	/* XML_MIN_SIZE build: reuse the PREFIX-based VTABLE1 entries and only
	   swap in the little2_ conversion routines. */
	627	#undef VTABLE
	628	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
	629
	630	#else /* not XML_MIN_SIZE */
	631
	/* Otherwise instantiate xmltok_impl.c a second time with the little2_
	   prefix, a constant MINBPC of 2 and the LITTLE2_* hooks, then remove
	   the hooks again.  IS_INVALID_CHAR is not defined in this branch. */
	632	#undef PREFIX
	633	#define PREFIX(ident) little2_ ## ident
	634	#define MINBPC(enc) 2
	635	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
	636	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
	637	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
	638	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
	639	#define IS_NAME_CHAR(enc, p, n) 0
	640	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
	641	#define IS_NMSTRT_CHAR(enc, p, n) (0)
	642	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
	643
	644	#include "xmltok_impl.c"
	645
	646	#undef MINBPC
	647	#undef BYTE_TYPE
	648	#undef BYTE_TO_ASCII
	649	#undef CHAR_MATCHES
	650	#undef IS_NAME_CHAR
	651	#undef IS_NAME_CHAR_MINBPC
	652	#undef IS_NMSTRT_CHAR
	653	#undef IS_NMSTRT_CHAR_MINBPC
	654	#undef IS_INVALID_CHAR
	655
	656	#endif /* not XML_MIN_SIZE */
	657
	/* XML_NS-only little-endian 2-byte encoding object.  The last ENCODING
	   initializer is 1 when XML_BYTE_ORDER == 12 and 0 otherwise.
	   NOTE(review): 12 presumably denotes a little-endian host and the flag
	   "already native UTF-16" -- confirm in xmltok.h / xmldef.h. */
	658	#ifdef XML_NS
	659
	660	static const struct normal_encoding little2_encoding_ns = {
	661	{ VTABLE, 2, 0,
	662	#if XML_BYTE_ORDER == 12
	663	1
	664	#else
	665	0
	666	#endif
	667	},
	668	{
	669	#include "asciitab.h"
	670	#include "latin1tab.h"
	671	},
	672	STANDARD_VTABLE(little2_)
	673	};
	674
	675	#endif
	676
	/* Little-endian 2-byte encoding object with BT_COLON defined as
	   BT_NMSTRT while asciitab.h is included.  The last ENCODING
	   initializer is 1 only when XML_BYTE_ORDER == 12. */
	677	static const struct normal_encoding little2_encoding = {
	678	{ VTABLE, 2, 0,
	679	#if XML_BYTE_ORDER == 12
	680	1
	681	#else
	682	0
	683	#endif
	684	},
	685	{
	686	#define BT_COLON BT_NMSTRT
	687	#include "asciitab.h"
	688	#undef BT_COLON
	689	#include "latin1tab.h"
	690	},
	691	STANDARD_VTABLE(little2_)
	692	};
	693
	/* "Internal" little-endian objects, built from iasciitab.h and compiled
	   unless XML_BYTE_ORDER == 21.  The _ns variant additionally requires
	   XML_NS and keeps BT_COLON as the header uses it. */
	694	#if XML_BYTE_ORDER != 21
	695
	696	#ifdef XML_NS
	697
	698	static const struct normal_encoding internal_little2_encoding_ns = {
	699	{ VTABLE, 2, 0, 1 },
	700	{
	701	#include "iasciitab.h"
	702	#include "latin1tab.h"
	703	},
	704	STANDARD_VTABLE(little2_)
	705	};
	706
	707	#endif
	708
	709	static const struct normal_encoding internal_little2_encoding = {
	710	{ VTABLE, 2, 0, 1 },
	711	{
	712	#define BT_COLON BT_NMSTRT
	713	#include "iasciitab.h"
	714	#undef BT_COLON
	715	#include "latin1tab.h"
	716	},
	717	STANDARD_VTABLE(little2_)
	718	};
	719
	720	#endif
	721
	722
	/* Big-endian 2-byte helpers: (p)[0] is the high byte, (p)[1] the low
	   byte.  When the high byte is 0 the byte type comes from the encoding's
	   type table indexed by the low byte; otherwise from
	   unicode_byte_type.  BYTE_TO_ASCII yields -1 for characters whose high
	   byte is not 0. */
	723	#define BIG2_BYTE_TYPE(enc, p) \
	724	((p)[0] == 0 \
	725	? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
	726	: unicode_byte_type((p)[0], (p)[1]))
	727	#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
	728	#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
	729	#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
	730	UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
	731	#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
	732	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
	733
	734	#ifdef XML_MIN_SIZE
	735
	736	static
	737	int big2_byteType(const ENCODING enc, const char p)
	738	{
	739	return BIG2_BYTE_TYPE(enc, p);
	740	}
	741
	742	static
	743	int big2_byteToAscii(const ENCODING enc, const char p)
	744	{
	745	return BIG2_BYTE_TO_ASCII(enc, p);
	746	}
	747
	748	static
	749	int big2_charMatches(const ENCODING enc, const char p, int c)
	750	{
	751	return BIG2_CHAR_MATCHES(enc, p, c);
	752	}
	753
	754	static
	755	int big2_isNameMin(const ENCODING enc, const char p)
	756	{
	757	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
	758	}
	759
	760	static
	761	int big2_isNmstrtMin(const ENCODING enc, const char p)
	762	{
	763	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
	764	}
	765
	/* XML_MIN_SIZE build: reuse the PREFIX-based VTABLE1 entries and only
	   swap in the big2_ conversion routines. */
	766	#undef VTABLE
	767	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
	768
	769	#else /* not XML_MIN_SIZE */
	770
	/* Otherwise instantiate xmltok_impl.c again with the big2_ prefix, a
	   constant MINBPC of 2 and the BIG2_* hooks, then remove the hooks.
	   IS_INVALID_CHAR is not defined in this branch. */
	771	#undef PREFIX
	772	#define PREFIX(ident) big2_ ## ident
	773	#define MINBPC(enc) 2
	774	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
	775	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
	776	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
	777	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
	778	#define IS_NAME_CHAR(enc, p, n) 0
	779	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
	780	#define IS_NMSTRT_CHAR(enc, p, n) (0)
	781	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
	782
	783	#include "xmltok_impl.c"
	784
	785	#undef MINBPC
	786	#undef BYTE_TYPE
	787	#undef BYTE_TO_ASCII
	788	#undef CHAR_MATCHES
	789	#undef IS_NAME_CHAR
	790	#undef IS_NAME_CHAR_MINBPC
	791	#undef IS_NMSTRT_CHAR
	792	#undef IS_NMSTRT_CHAR_MINBPC
	793	#undef IS_INVALID_CHAR
	794
	795	#endif /* not XML_MIN_SIZE */
	796
	/* XML_NS-only big-endian 2-byte encoding object.  The last ENCODING
	   initializer is 1 when XML_BYTE_ORDER == 21 and 0 otherwise.
	   NOTE(review): 21 presumably denotes a big-endian host -- confirm in
	   xmldef.h. */
	797	#ifdef XML_NS
	798
	799	static const struct normal_encoding big2_encoding_ns = {
	800	{ VTABLE, 2, 0,
	801	#if XML_BYTE_ORDER == 21
	802	1
	803	#else
	804	0
	805	#endif
	806	},
	807	{
	808	#include "asciitab.h"
	809	#include "latin1tab.h"
	810	},
	811	STANDARD_VTABLE(big2_)
	812	};
	813
	814	#endif
	815
	/* Big-endian 2-byte encoding object with BT_COLON defined as BT_NMSTRT
	   while asciitab.h is included.  The last ENCODING initializer is 1
	   only when XML_BYTE_ORDER == 21. */
	816	static const struct normal_encoding big2_encoding = {
	817	{ VTABLE, 2, 0,
	818	#if XML_BYTE_ORDER == 21
	819	1
	820	#else
	821	0
	822	#endif
	823	},
	824	{
	825	#define BT_COLON BT_NMSTRT
	826	#include "asciitab.h"
	827	#undef BT_COLON
	828	#include "latin1tab.h"
	829	},
	830	STANDARD_VTABLE(big2_)
	831	};
	832
	/* "Internal" big-endian objects, built from iasciitab.h and compiled
	   unless XML_BYTE_ORDER == 12.  The _ns variant additionally requires
	   XML_NS and keeps BT_COLON as the header uses it. */
	833	#if XML_BYTE_ORDER != 12
	834
	835	#ifdef XML_NS
	836
	837	static const struct normal_encoding internal_big2_encoding_ns = {
	838	{ VTABLE, 2, 0, 1 },
	839	{
	840	#include "iasciitab.h"
	841	#include "latin1tab.h"
	842	},
	843	STANDARD_VTABLE(big2_)
	844	};
	845
	846	#endif
	847
	848	static const struct normal_encoding internal_big2_encoding = {
	849	{ VTABLE, 2, 0, 1 },
	850	{
	851	#define BT_COLON BT_NMSTRT
	852	#include "iasciitab.h"
	853	#undef BT_COLON
	854	#include "latin1tab.h"
	855	},
	856	STANDARD_VTABLE(big2_)
	857	};
	858
	859	#endif
	860
	861	#undef PREFIX
	862
	/* Compare two NUL-terminated strings, ignoring the case of the ASCII
	   letters a-z.  Returns 1 when they are equal (including length) and 0
	   otherwise. */
	static
	int streqci(const char *s1, const char *s2)
	{
	  for (;;) {
	    char c1 = *s1++;
	    char c2 = *s2++;
	    /* fold lower case onto upper case before comparing */
	    if ('a' <= c1 && c1 <= 'z')
	      c1 += 'A' - 'a';
	    if ('a' <= c2 && c2 <= 'z')
	      c2 += 'A' - 'a';
	    if (c1 != c2)
	      return 0;
	    if (!c1)
	      break;
	  }
	  return 1;
	}
	880
	881	static
	882	void initUpdatePosition(const ENCODING enc, const char ptr,
	883	const char end, POSITION pos)
	884	{
	885	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
	886	}
	887
	888	static
	889	int toAscii(const ENCODING enc, const char ptr, const char *end)
	890	{
	891	char buf[1];
	892	char *p = buf;
	893	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
	894	if (p == buf)
	895	return -1;
	896	else
	897	return buf[0];
	898	}
	899
	/* Return 1 if c is space (0x20), carriage return (0xD), line feed (0xA)
	   or tab (0x9); 0 for any other value, including -1. */
	static
	int isSpace(int c)
	{
	  switch (c) {
	  case 0x20:
	  case 0xD:
	  case 0xA:
	  case 0x9:
	    return 1;
	  }
	  return 0;
	}
	912
	913	/* Return 1 if there's just optional white space
	914	or there's an S followed by name=val. */
	/* Parse one name="value" pseudo-attribute in [ptr, end).
	   On success returns 1 with *namePtr at the name, *valPtr just after
	   the opening quote and *nextTokPtr one character past the closing
	   quote.  When only white space (or nothing) remains, it returns 1 with
	   *namePtr set to 0.  On a syntax error returns 0 with *nextTokPtr at
	   the offending position.  All stepping is in units of
	   enc->minBytesPerChar. */
	915	static
	916	int parsePseudoAttribute(const ENCODING *enc,
	917	const char *ptr,
	918	const char *end,
	919	const char **namePtr,
	920	const char **valPtr,
	921	const char **nextTokPtr)
	922	{
	923	int c;
	924	char open;
	925	if (ptr == end) {
	926	*namePtr = 0;
	927	return 1;
	928	}
	/* a pseudo-attribute must be preceded by white space */
	929	if (!isSpace(toAscii(enc, ptr, end))) {
	930	*nextTokPtr = ptr;
	931	return 0;
	932	}
	933	do {
	934	ptr += enc->minBytesPerChar;
	935	} while (isSpace(toAscii(enc, ptr, end)));
	936	if (ptr == end) {
	937	*namePtr = 0;
	938	return 1;
	939	}
	940	*namePtr = ptr;
	/* scan the name up to '=' (optionally preceded by white space) */
	941	for (;;) {
	942	c = toAscii(enc, ptr, end);
	943	if (c == -1) {
	944	*nextTokPtr = ptr;
	945	return 0;
	946	}
	947	if (c == '=')
	948	break;
	949	if (isSpace(c)) {
	950	do {
	951	ptr += enc->minBytesPerChar;
	952	} while (isSpace(c = toAscii(enc, ptr, end)));
	953	if (c != '=') {
	954	*nextTokPtr = ptr;
	955	return 0;
	956	}
	957	break;
	958	}
	959	ptr += enc->minBytesPerChar;
	960	}
	/* '=' found at the very start of the name: the name is empty */
	961	if (ptr == *namePtr) {
	962	*nextTokPtr = ptr;
	963	return 0;
	964	}
	965	ptr += enc->minBytesPerChar;
	966	c = toAscii(enc, ptr, end);
	967	while (isSpace(c)) {
	968	ptr += enc->minBytesPerChar;
	969	c = toAscii(enc, ptr, end);
	970	}
	/* the value must be quoted with " or ' */
	971	if (c != '"' && c != '\'') {
	972	*nextTokPtr = ptr;
	973	return 0;
	974	}
	975	open = c;
	976	ptr += enc->minBytesPerChar;
	977	*valPtr = ptr;
	/* value characters are limited to letters, digits, '.', '-' and '_';
	   the loop ends at the matching quote */
	978	for (;; ptr += enc->minBytesPerChar) {
	979	c = toAscii(enc, ptr, end);
	980	if (c == open)
	981	break;
	982	if (!('a' <= c && c <= 'z')
	983	&& !('A' <= c && c <= 'Z')
	984	&& !('0' <= c && c <= '9')
	985	&& c != '.'
	986	&& c != '-'
	987	&& c != '_') {
	988	*nextTokPtr = ptr;
	989	return 0;
	990	}
	991	}
	992	*nextTokPtr = ptr + enc->minBytesPerChar;
	993	return 1;
	994	}
	995
	996	static
	997	int doParseXmlDecl(const ENCODING (encodingFinder)(const ENCODING *,
	998	const char *,
	999	const char *),
	1000	int isGeneralTextEntity,
	1001	const ENCODING *enc,
	1002	const char *ptr,
	1003	const char *end,
	1004	const char **badPtr,
	1005	const char **versionPtr,
	1006	const char **encodingName,
	1007	const ENCODING **encoding,
	1008	int *standalone)
	1009	{
	1010	const char *val = 0;
	1011	const char *name = 0;
	1012	ptr += 5 * enc->minBytesPerChar;
	1013	end -= 2 * enc->minBytesPerChar;
	1014	if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) \|\| !name) {
	1015	*badPtr = ptr;
	1016	return 0;
	1017	}
	1018	if (!XmlNameMatchesAscii(enc, name, "version")) {
	1019	if (!isGeneralTextEntity) {
	1020	*badPtr = name;
	1021	return 0;
	1022	}
	1023	}
	1024	else {
	1025	if (versionPtr)
	1026	*versionPtr = val;
	1027	if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
	1028	*badPtr = ptr;
	1029	return 0;
	1030	}
	1031	if (!name) {
	1032	if (isGeneralTextEntity) {
	1033	/* a TextDecl must have an EncodingDecl */
	1034	*badPtr = ptr;
	1035	return 0;
	1036	}
	1037	return 1;
	1038	}
	1039	}
	1040	if (XmlNameMatchesAscii(enc, name, "encoding")) {
	1041	int c = toAscii(enc, val, end);
	1042	if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
	1043	*badPtr = val;
	1044	return 0;
	1045	}
	1046	if (encodingName)
	1047	*encodingName = val;
	1048	if (encoding)
	1049	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
	1050	if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
	1051	*badPtr = ptr;
	1052	return 0;
	1053	}
	1054	if (!name)
	1055	return 1;
	1056	}
	1057	if (!XmlNameMatchesAscii(enc, name, "standalone") \|\| isGeneralTextEntity) {
	1058	*badPtr = name;
	1059	return 0;
	1060	}
	1061	if (XmlNameMatchesAscii(enc, val, "yes")) {
	1062	if (standalone)
	1063	*standalone = 1;
	1064	}
	1065	else if (XmlNameMatchesAscii(enc, val, "no")) {
	1066	if (standalone)
	1067	*standalone = 0;
	1068	}
	1069	else {
	1070	*badPtr = val;
	1071	return 0;
	1072	}
	1073	while (isSpace(toAscii(enc, ptr, end)))
	1074	ptr += enc->minBytesPerChar;
	1075	if (ptr != end) {
	1076	*badPtr = ptr;
	1077	return 0;
	1078	}
	1079	return 1;
	1080	}
	1081
	1082	static
	1083	int checkCharRefNumber(int result)
	1084	{
	1085	switch (result >> 8) {
	1086	case 0xD8: case 0xD9: case 0xDA: case 0xDB:
	1087	case 0xDC: case 0xDD: case 0xDE: case 0xDF:
	1088	return -1;
	1089	case 0:
	1090	if (latin1_encoding.type[result] == BT_NONXML)
	1091	return -1;
	1092	break;
	1093	case 0xFF:
	1094	if (result == 0xFFFE \|\| result == 0xFFFF)
	1095	return -1;
	1096	break;
	1097	}
	1098	return result;
	1099	}
	1100
	1101	int XmlUtf8Encode(int c, char *buf)
	1102	{
	1103	enum {
	1104	/* minN is minimum legal resulting value for N byte sequence */
	1105	min2 = 0x80,
	1106	min3 = 0x800,
	1107	min4 = 0x10000
	1108	};
	1109
	1110	if (c < 0)
	1111	return 0;
	1112	if (c < min2) {
	1113	buf[0] = (c \| UTF8_cval1);
	1114	return 1;
	1115	}
	1116	if (c < min3) {
	1117	buf[0] = ((c >> 6) \| UTF8_cval2);
	1118	buf[1] = ((c & 0x3f) \| 0x80);
	1119	return 2;
	1120	}
	1121	if (c < min4) {
	1122	buf[0] = ((c >> 12) \| UTF8_cval3);
	1123	buf[1] = (((c >> 6) & 0x3f) \| 0x80);
	1124	buf[2] = ((c & 0x3f) \| 0x80);
	1125	return 3;
	1126	}
	1127	if (c < 0x110000) {
	1128	buf[0] = ((c >> 18) \| UTF8_cval4);
	1129	buf[1] = (((c >> 12) & 0x3f) \| 0x80);
	1130	buf[2] = (((c >> 6) & 0x3f) \| 0x80);
	1131	buf[3] = ((c & 0x3f) \| 0x80);
	1132	return 4;
	1133	}
	1134	return 0;
	1135	}
	1136
	/* Encode code point charNum as UTF-16 into buf, which must have room
	   for 2 units.  Returns 1 for values below 0x10000, 2 (a surrogate pair)
	   for values below 0x110000, and 0 for negative or larger values. */
	int XmlUtf16Encode(int charNum, unsigned short *buf)
	{
	  if (charNum < 0)
	    return 0;
	  if (charNum < 0x10000) {
	    buf[0] = charNum;
	    return 1;
	  }
	  if (charNum < 0x110000) {
	    charNum -= 0x10000;
	    buf[0] = (charNum >> 10) + 0xD800;    /* high surrogate */
	    buf[1] = (charNum & 0x3FF) + 0xDC00;  /* low surrogate */
	    return 2;
	  }
	  return 0;
	}
	1153
	1154	struct unknown_encoding {
	1155	struct normal_encoding normal;
	1156	int (convert)(void userData, const char *p);
	1157	void *userData;
	1158	unsigned short utf16[256];
	1159	char utf8[256][4];
	1160	};
	1161
	1162	int XmlSizeOfUnknownEncoding()
	1163	{
	1164	return sizeof(struct unknown_encoding);
	1165	}
	1166
	1167	static
	1168	int unknown_isName(const ENCODING enc, const char p)
	1169	{
	1170	int c = ((const struct unknown_encoding *)enc)
	1171	->convert(((const struct unknown_encoding *)enc)->userData, p);
	1172	if (c & ~0xFFFF)
	1173	return 0;
	1174	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
	1175	}
	1176
	1177	static
	1178	int unknown_isNmstrt(const ENCODING enc, const char p)
	1179	{
	1180	int c = ((const struct unknown_encoding *)enc)
	1181	->convert(((const struct unknown_encoding *)enc)->userData, p);
	1182	if (c & ~0xFFFF)
	1183	return 0;
	1184	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
	1185	}
	1186
	1187	static
	1188	int unknown_isInvalid(const ENCODING enc, const char p)
	1189	{
	1190	int c = ((const struct unknown_encoding *)enc)
	1191	->convert(((const struct unknown_encoding *)enc)->userData, p);
	1192	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
	1193	}
	1194
	1195	static
	1196	void unknown_toUtf8(const ENCODING *enc,
	1197	const char *fromP, const char fromLim,
	1198	char *toP, const char toLim)
	1199	{
	1200	char buf[XML_UTF8_ENCODE_MAX];
	1201	for (;;) {
	1202	const char *utf8;
	1203	int n;
	1204	if (*fromP == fromLim)
	1205	break;
	1206	utf8 = ((const struct unknown_encoding )enc)->utf8[(unsigned char)*fromP];
	1207	n = *utf8++;
	1208	if (n == 0) {
	1209	int c = ((const struct unknown_encoding *)enc)
	1210	->convert(((const struct unknown_encoding )enc)->userData, fromP);
	1211	n = XmlUtf8Encode(c, buf);
	1212	if (n > toLim - *toP)
	1213	break;
	1214	utf8 = buf;
	1215	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
	1216	- (BT_LEAD2 - 2);
	1217	}
	1218	else {
	1219	if (n > toLim - *toP)
	1220	break;
	1221	(*fromP)++;
	1222	}
	1223	do {
	1224	(toP)++ = *utf8++;
	1225	} while (--n != 0);
	1226	}
	1227	}
	1228
	1229	static
	1230	void unknown_toUtf16(const ENCODING *enc,
	1231	const char *fromP, const char fromLim,
	1232	unsigned short *toP, const unsigned short toLim)
	1233	{
	1234	while (fromP != fromLim && toP != toLim) {
	1235	unsigned short c
	1236	= ((const struct unknown_encoding )enc)->utf16[(unsigned char)*fromP];
	1237	if (c == 0) {
	1238	c = (unsigned short)((const struct unknown_encoding *)enc)
	1239	->convert(((const struct unknown_encoding )enc)->userData, fromP);
	1240	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
	1241	- (BT_LEAD2 - 2);
	1242	}
	1243	else
	1244	(*fromP)++;
	1245	(toP)++ = c;
	1246	}
	1247	}
	1248
	1249	ENCODING *
	1250	XmlInitUnknownEncoding(void *mem,
	1251	int *table,
	1252	int (convert)(void userData, const char *p),
	1253	void *userData)
	1254	{
	1255	int i;
	1256	struct unknown_encoding *e = mem;
	1257	for (i = 0; i < sizeof(struct normal_encoding); i++)
	1258	((char )mem)[i] = ((char )&latin1_encoding)[i];
	1259	for (i = 0; i < 128; i++)
	1260	if (latin1_encoding.type[i] != BT_OTHER
	1261	&& latin1_encoding.type[i] != BT_NONXML
	1262	&& table[i] != i)
	1263	return 0;
	1264	for (i = 0; i < 256; i++) {
	1265	int c = table[i];
	1266	if (c == -1) {
	1267	e->normal.type[i] = BT_MALFORM;
	1268	/* This shouldn't really get used. */
	1269	e->utf16[i] = 0xFFFF;
	1270	e->utf8[i][0] = 1;
	1271	e->utf8[i][1] = 0;
	1272	}
	1273	else if (c < 0) {
	1274	if (c < -4)
	1275	return 0;
	1276	e->normal.type[i] = BT_LEAD2 - (c + 2);
	1277	e->utf8[i][0] = 0;
	1278	e->utf16[i] = 0;
	1279	}
	1280	else if (c < 0x80) {
	1281	if (latin1_encoding.type[c] != BT_OTHER
	1282	&& latin1_encoding.type[c] != BT_NONXML
	1283	&& c != i)
	1284	return 0;
	1285	e->normal.type[i] = latin1_encoding.type[c];
	1286	e->utf8[i][0] = 1;
	1287	e->utf8[i][1] = (char)c;
	1288	e->utf16[i] = c == 0 ? 0xFFFF : c;
	1289	}
	1290	else if (checkCharRefNumber(c) < 0) {
	1291	e->normal.type[i] = BT_NONXML;
	1292	/* This shouldn't really get used. */
	1293	e->utf16[i] = 0xFFFF;
	1294	e->utf8[i][0] = 1;
	1295	e->utf8[i][1] = 0;
	1296	}
	1297	else {
	1298	if (c > 0xFFFF)
	1299	return 0;
	1300	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
	1301	e->normal.type[i] = BT_NMSTRT;
	1302	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
	1303	e->normal.type[i] = BT_NAME;
	1304	else
	1305	e->normal.type[i] = BT_OTHER;
	1306	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
	1307	e->utf16[i] = c;
	1308	}
	1309	}
	1310	e->userData = userData;
	1311	e->convert = convert;
	1312	if (convert) {
	1313	e->normal.isName2 = unknown_isName;
	1314	e->normal.isName3 = unknown_isName;
	1315	e->normal.isName4 = unknown_isName;
	1316	e->normal.isNmstrt2 = unknown_isNmstrt;
	1317	e->normal.isNmstrt3 = unknown_isNmstrt;
	1318	e->normal.isNmstrt4 = unknown_isNmstrt;
	1319	e->normal.isInvalid2 = unknown_isInvalid;
	1320	e->normal.isInvalid3 = unknown_isInvalid;
	1321	e->normal.isInvalid4 = unknown_isInvalid;
	1322	}
	1323	e->normal.enc.utf8Convert = unknown_toUtf8;
	1324	e->normal.enc.utf16Convert = unknown_toUtf16;
	1325	return &(e->normal.enc);
	1326	}
	1327
	/* Indices identifying the built-in encodings.
	   If this enumeration is changed, getEncodingIndex and encodings
	   must also be changed. */
	enum {
	  UNKNOWN_ENC = -1,
	  ISO_8859_1_ENC = 0,
	  US_ASCII_ENC,
	  UTF_8_ENC,
	  UTF_16_ENC,
	  UTF_16BE_ENC,
	  UTF_16LE_ENC,
	  /* must match encodingNames up to here */
	  NO_ENC
	};
	1341
	1342	static
	1343	int getEncodingIndex(const char *name)
	1344	{
	1345	static const char *encodingNames[] = {
	1346	"ISO-8859-1",
	1347	"US-ASCII",
	1348	"UTF-8",
	1349	"UTF-16",
	1350	"UTF-16BE"
	1351	"UTF-16LE",
	1352	};
	1353	int i;
	1354	if (name == 0)
	1355	return NO_ENC;
	1356	for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
	1357	if (streqci(name, encodingNames[i]))
	1358	return i;
	1359	return UNKNOWN_ENC;
	1360	}
	1361
	/* For binary compatibility, we store the index of the encoding specified
	   at initialization in the isUtf16 member. */

	#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
	1366
	1367	/* This is what detects the encoding.
	1368	encodingTable maps from encoding indices to encodings;
	1369	INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
	1370	state is XML_CONTENT_STATE if we're parsing an external text entity,
	1371	and XML_PROLOG_STATE otherwise.
	1372	*/
	1373
	1374
	1375	static
	1376	int initScan(const ENCODING **encodingTable,
	1377	const INIT_ENCODING *enc,
	1378	int state,
	1379	const char *ptr,
	1380	const char *end,
	1381	const char **nextTokPtr)
	1382	{
	1383	const ENCODING **encPtr;
	1384
	1385	if (ptr == end)
	1386	return XML_TOK_NONE;
	1387	encPtr = enc->encPtr;
	1388	if (ptr + 1 == end) {
	1389	/* only a single byte available for auto-detection */
	1390	/* a well-formed document entity must have more than one byte */
	1391	if (state != XML_CONTENT_STATE)
	1392	return XML_TOK_PARTIAL;
	1393	/* so we're parsing an external text entity... */
	1394	/* if UTF-16 was externally specified, then we need at least 2 bytes */
	1395	switch (INIT_ENC_INDEX(enc)) {
	1396	case UTF_16_ENC:
	1397	case UTF_16LE_ENC:
	1398	case UTF_16BE_ENC:
	1399	return XML_TOK_PARTIAL;
	1400	}
	1401	switch ((unsigned char)*ptr) {
	1402	case 0xFE:
	1403	case 0xFF:
	1404	case 0xEF: /* possibly first byte of UTF-8 BOM */
	1405	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
	1406	&& state == XML_CONTENT_STATE)
	1407	break;
	1408	/* fall through */
	1409	case 0x00:
	1410	case 0x3C:
	1411	return XML_TOK_PARTIAL;
	1412	}
	1413	}
	1414	else {
	1415	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1]) {
	1416	case 0xFEFF:
	1417	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
	1418	&& state == XML_CONTENT_STATE)
	1419	break;
	1420	*nextTokPtr = ptr + 2;
	1421	*encPtr = encodingTable[UTF_16BE_ENC];
	1422	return XML_TOK_BOM;
	1423	/* 00 3C is handled in the default case */
	1424	case 0x3C00:
	1425	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
	1426	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
	1427	&& state == XML_CONTENT_STATE)
	1428	break;
	1429	*encPtr = encodingTable[UTF_16LE_ENC];
	1430	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1431	case 0xFFFE:
	1432	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
	1433	&& state == XML_CONTENT_STATE)
	1434	break;
	1435	*nextTokPtr = ptr + 2;
	1436	*encPtr = encodingTable[UTF_16LE_ENC];
	1437	return XML_TOK_BOM;
	1438	case 0xEFBB:
	1439	/* Maybe a UTF-8 BOM (EF BB BF) */
	1440	/* If there's an explicitly specified (external) encoding
	1441	of ISO-8859-1 or some flavour of UTF-16
	1442	and this is an external text entity,
	1443	don't look for the BOM,
	1444	because it might be a legal data. */
	1445	if (state == XML_CONTENT_STATE) {
	1446	int e = INIT_ENC_INDEX(enc);
	1447	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC \|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
	1448	break;
	1449	}
	1450	if (ptr + 2 == end)
	1451	return XML_TOK_PARTIAL;
	1452	if ((unsigned char)ptr[2] == 0xBF) {
	1453	*encPtr = encodingTable[UTF_8_ENC];
	1454	return XML_TOK_BOM;
	1455	}
	1456	break;
	1457	default:
	1458	if (ptr[0] == '\0') {
	1459	/* 0 isn't a legal data character. Furthermore a document entity can only
	1460	start with ASCII characters. So the only way this can fail to be big-endian
	1461	UTF-16 if it it's an external parsed general entity that's labelled as
	1462	UTF-16LE. */
	1463	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
	1464	break;
	1465	*encPtr = encodingTable[UTF_16BE_ENC];
	1466	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1467	}
	1468	else if (ptr[1] == '\0') {
	1469	/* We could recover here in the case:
	1470	- parsing an external entity
	1471	- second byte is 0
	1472	- no externally specified encoding
	1473	- no encoding declaration
	1474	by assuming UTF-16LE. But we don't, because this would mean when
	1475	presented just with a single byte, we couldn't reliably determine
	1476	whether we needed further bytes. */
	1477	if (state == XML_CONTENT_STATE)
	1478	break;
	1479	*encPtr = encodingTable[UTF_16LE_ENC];
	1480	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1481	}
	1482	break;
	1483	}
	1484	}
	1485	*encPtr = encodingTable[(int)INIT_ENC_INDEX(enc)];
	1486	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1487	}
	1488
	1489
	1490	#define NS(x) x
	1491	#define ns(x) x
	1492	#include "xmltok_ns.c"
	1493	#undef NS
	1494	#undef ns
	1495
	1496	#ifdef XML_NS
	1497
	1498	#define NS(x) x ## NS
	1499	#define ns(x) x ## _ns
	1500
	1501	#include "xmltok_ns.c"
	1502
	1503	#undef NS
	1504	#undef ns
	1505
	1506	ENCODING *
	1507	XmlInitUnknownEncodingNS(void *mem,
	1508	int *table,
	1509	int (convert)(void userData, const char *p),
	1510	void *userData)
	1511	{
	1512	ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
	1513	if (enc)
	1514	((struct normal_encoding *)enc)->type[':'] = BT_COLON;
	1515	return enc;
	1516	}
	1517
	1518	#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: