HTMLparser.c 191 KB


  1. /*
  2. * HTMLparser.c : an HTML 4.0 non-verifying parser
  3. *
  4. * See Copyright for the status of this software.
  5. *
  6. * daniel@veillard.com
  7. */
  8. #define IN_LIBXML
  9. #include "libxml.h"
  10. #ifdef LIBXML_HTML_ENABLED
  11. #include <string.h>
  12. #ifdef HAVE_CTYPE_H
  13. #include <ctype.h>
  14. #endif
  15. #ifdef HAVE_STDLIB_H
  16. #include <stdlib.h>
  17. #endif
  18. #ifdef HAVE_SYS_STAT_H
  19. #include <sys/stat.h>
  20. #endif
  21. #ifdef HAVE_FCNTL_H
  22. #include <fcntl.h>
  23. #endif
  24. #ifdef HAVE_UNISTD_H
  25. #include <unistd.h>
  26. #endif
  27. #ifdef HAVE_ZLIB_H
  28. #include <zlib.h>
  29. #endif
  30. #include <libxml/xmlmemory.h>
  31. #include <libxml/tree.h>
  32. #include <libxml/parser.h>
  33. #include <libxml/parserInternals.h>
  34. #include <libxml/xmlerror.h>
  35. #include <libxml/HTMLparser.h>
  36. #include <libxml/HTMLtree.h>
  37. #include <libxml/entities.h>
  38. #include <libxml/encoding.h>
  39. #include <libxml/valid.h>
  40. #include <libxml/xmlIO.h>
  41. #include <libxml/globals.h>
  42. #include <libxml/uri.h>
  43. #define HTML_MAX_NAMELEN 1000
  44. #define HTML_PARSER_BIG_BUFFER_SIZE 1000
  45. #define HTML_PARSER_BUFFER_SIZE 100
  46. /* #define DEBUG */
  47. /* #define DEBUG_PUSH */
  48. static int htmlOmittedDefaultValue = 1;
  49. xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
  50. xmlChar end, xmlChar end2, xmlChar end3);
  51. static void htmlParseComment(htmlParserCtxtPtr ctxt);
  52. /************************************************************************
  53. * *
  54. * Some factorized error routines *
  55. * *
  56. ************************************************************************/
  57. /**
  58. * htmlErrMemory:
  59. * @ctxt: an HTML parser context
  60. * @extra: extra informations
  61. *
  62. * Handle a redefinition of attribute error
  63. */
  64. static void
  65. htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
  66. {
  67. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  68. (ctxt->instate == XML_PARSER_EOF))
  69. return;
  70. if (ctxt != NULL) {
  71. ctxt->errNo = XML_ERR_NO_MEMORY;
  72. ctxt->instate = XML_PARSER_EOF;
  73. ctxt->disableSAX = 1;
  74. }
  75. if (extra)
  76. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  77. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
  78. NULL, NULL, 0, 0,
  79. "Memory allocation failed : %s\n", extra);
  80. else
  81. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
  82. XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
  83. NULL, NULL, 0, 0, "Memory allocation failed\n");
  84. }
  85. /**
  86. * htmlParseErr:
  87. * @ctxt: an HTML parser context
  88. * @error: the error number
  89. * @msg: the error message
  90. * @str1: string infor
  91. * @str2: string infor
  92. *
  93. * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  94. */
  95. static void
  96. htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  97. const char *msg, const xmlChar *str1, const xmlChar *str2)
  98. {
  99. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  100. (ctxt->instate == XML_PARSER_EOF))
  101. return;
  102. if (ctxt != NULL)
  103. ctxt->errNo = error;
  104. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
  105. XML_ERR_ERROR, NULL, 0,
  106. (const char *) str1, (const char *) str2,
  107. NULL, 0, 0,
  108. msg, str1, str2);
  109. if (ctxt != NULL)
  110. ctxt->wellFormed = 0;
  111. }
  112. /**
  113. * htmlParseErrInt:
  114. * @ctxt: an HTML parser context
  115. * @error: the error number
  116. * @msg: the error message
  117. * @val: integer info
  118. *
  119. * Handle a fatal parser error, i.e. violating Well-Formedness constraints
  120. */
  121. static void
  122. htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
  123. const char *msg, int val)
  124. {
  125. if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
  126. (ctxt->instate == XML_PARSER_EOF))
  127. return;
  128. if (ctxt != NULL)
  129. ctxt->errNo = error;
  130. __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
  131. XML_ERR_ERROR, NULL, 0, NULL, NULL,
  132. NULL, val, 0, msg, val);
  133. if (ctxt != NULL)
  134. ctxt->wellFormed = 0;
  135. }
  136. /************************************************************************
  137. * *
  138. * Parser stacks related functions and macros *
  139. * *
  140. ************************************************************************/
  141. /**
  142. * htmlnamePush:
  143. * @ctxt: an HTML parser context
  144. * @value: the element name
  145. *
  146. * Pushes a new element name on top of the name stack
  147. *
  148. * Returns 0 in case of error, the index in the stack otherwise
  149. */
  150. static int
  151. htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
  152. {
  153. if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
  154. ctxt->html = 3;
  155. if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
  156. ctxt->html = 10;
  157. if (ctxt->nameNr >= ctxt->nameMax) {
  158. ctxt->nameMax *= 2;
  159. ctxt->nameTab = (const xmlChar * *)
  160. xmlRealloc((xmlChar * *)ctxt->nameTab,
  161. ctxt->nameMax *
  162. sizeof(ctxt->nameTab[0]));
  163. if (ctxt->nameTab == NULL) {
  164. htmlErrMemory(ctxt, NULL);
  165. return (0);
  166. }
  167. }
  168. ctxt->nameTab[ctxt->nameNr] = value;
  169. ctxt->name = value;
  170. return (ctxt->nameNr++);
  171. }
  172. /**
  173. * htmlnamePop:
  174. * @ctxt: an HTML parser context
  175. *
  176. * Pops the top element name from the name stack
  177. *
  178. * Returns the name just removed
  179. */
  180. static const xmlChar *
  181. htmlnamePop(htmlParserCtxtPtr ctxt)
  182. {
  183. const xmlChar *ret;
  184. if (ctxt->nameNr <= 0)
  185. return (NULL);
  186. ctxt->nameNr--;
  187. if (ctxt->nameNr < 0)
  188. return (NULL);
  189. if (ctxt->nameNr > 0)
  190. ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
  191. else
  192. ctxt->name = NULL;
  193. ret = ctxt->nameTab[ctxt->nameNr];
  194. ctxt->nameTab[ctxt->nameNr] = NULL;
  195. return (ret);
  196. }
  197. /*
  198. * Macros for accessing the content. Those should be used only by the parser,
  199. * and not exported.
  200. *
  201. * Dirty macros, i.e. one need to make assumption on the context to use them
  202. *
  203. * CUR_PTR return the current pointer to the xmlChar to be parsed.
  204. * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
  205. * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
  206. * in UNICODE mode. This should be used internally by the parser
  207. * only to compare to ASCII values otherwise it would break when
  208. * running with UTF-8 encoding.
  209. * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
  210. * to compare on ASCII based substring.
  211. * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
  212. * it should be used only to compare on ASCII based substring.
  213. * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
  214. * strings without newlines within the parser.
  215. *
  216. * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
  217. *
  218. * CURRENT Returns the current char value, with the full decoding of
  219. * UTF-8 if we are using this mode. It returns an int.
  220. * NEXT Skip to the next character, this does the proper decoding
  221. * in UTF-8 mode. It also pop-up unfinished entities on the fly.
  222. * NEXTL(l) Skip the current unicode character of l xmlChars long.
  223. * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
  224. */
  225. #define UPPER (toupper(*ctxt->input->cur))
  226. #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
  227. #define NXT(val) ctxt->input->cur[(val)]
  228. #define UPP(val) (toupper(ctxt->input->cur[(val)]))
  229. #define CUR_PTR ctxt->input->cur
  230. #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
  231. (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
  232. xmlParserInputShrink(ctxt->input)
  233. #define GROW if ((ctxt->progressive == 0) && \
  234. (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
  235. xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
  236. #define CURRENT ((int) (*ctxt->input->cur))
  237. #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
  238. /* Inported from XML */
  239. /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
  240. #define CUR ((int) (*ctxt->input->cur))
  241. #define NEXT xmlNextChar(ctxt)
  242. #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
  243. #define NXT(val) ctxt->input->cur[(val)]
  244. #define CUR_PTR ctxt->input->cur
  245. #define NEXTL(l) do { \
  246. if (*(ctxt->input->cur) == '\n') { \
  247. ctxt->input->line++; ctxt->input->col = 1; \
  248. } else ctxt->input->col++; \
  249. ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
  250. } while (0)
  251. /************
  252. \
  253. if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
  254. if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
  255. ************/
  256. #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
  257. #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
  258. #define COPY_BUF(l,b,i,v) \
  259. if (l == 1) b[i++] = (xmlChar) v; \
  260. else i += xmlCopyChar(l,&b[i],v)
  261. /**
  262. * htmlFindEncoding:
  263. * @the HTML parser context
  264. *
  265. * Ty to find and encoding in the current data available in the input
  266. * buffer this is needed to try to switch to the proper encoding when
  267. * one face a character error.
  268. * That's an heuristic, since it's operating outside of parsing it could
  269. * try to use a meta which had been commented out, that's the reason it
  270. * should only be used in case of error, not as a default.
  271. *
  272. * Returns an encoding string or NULL if not found, the string need to
  273. * be freed
  274. */
  275. static xmlChar *
  276. htmlFindEncoding(xmlParserCtxtPtr ctxt) {
  277. const xmlChar *start, *cur, *end;
  278. if ((ctxt == NULL) || (ctxt->input == NULL) ||
  279. (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
  280. (ctxt->input->buf->encoder != NULL))
  281. return(NULL);
  282. if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
  283. return(NULL);
  284. start = ctxt->input->cur;
  285. end = ctxt->input->end;
  286. /* we also expect the input buffer to be zero terminated */
  287. if (*end != 0)
  288. return(NULL);
  289. cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
  290. if (cur == NULL)
  291. return(NULL);
  292. cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
  293. if (cur == NULL)
  294. return(NULL);
  295. cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
  296. if (cur == NULL)
  297. return(NULL);
  298. cur += 8;
  299. start = cur;
  300. while (((*cur >= 'A') && (*cur <= 'Z')) ||
  301. ((*cur >= 'a') && (*cur <= 'z')) ||
  302. ((*cur >= '0') && (*cur <= '9')) ||
  303. (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
  304. cur++;
  305. if (cur == start)
  306. return(NULL);
  307. return(xmlStrndup(start, cur - start));
  308. }
  309. /**
  310. * htmlCurrentChar:
  311. * @ctxt: the HTML parser context
  312. * @len: pointer to the length of the char read
  313. *
  314. * The current char value, if using UTF-8 this may actually span multiple
  315. * bytes in the input buffer. Implement the end of line normalization:
  316. * 2.11 End-of-Line Handling
  317. * If the encoding is unspecified, in the case we find an ISO-Latin-1
  318. * char, then the encoding converter is plugged in automatically.
  319. *
  320. * Returns the current char value and its length
  321. */
  322. static int
  323. htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
  324. if (ctxt->instate == XML_PARSER_EOF)
  325. return(0);
  326. if (ctxt->token != 0) {
  327. *len = 0;
  328. return(ctxt->token);
  329. }
  330. if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
  331. /*
  332. * We are supposed to handle UTF8, check it's valid
  333. * From rfc2044: encoding of the Unicode values on UTF-8:
  334. *
  335. * UCS-4 range (hex.) UTF-8 octet sequence (binary)
  336. * 0000 0000-0000 007F 0xxxxxxx
  337. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
  338. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
  339. *
  340. * Check for the 0x110000 limit too
  341. */
  342. const unsigned char *cur = ctxt->input->cur;
  343. unsigned char c;
  344. unsigned int val;
  345. c = *cur;
  346. if (c & 0x80) {
  347. if (cur[1] == 0) {
  348. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  349. cur = ctxt->input->cur;
  350. }
  351. if ((cur[1] & 0xc0) != 0x80)
  352. goto encoding_error;
  353. if ((c & 0xe0) == 0xe0) {
  354. if (cur[2] == 0) {
  355. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  356. cur = ctxt->input->cur;
  357. }
  358. if ((cur[2] & 0xc0) != 0x80)
  359. goto encoding_error;
  360. if ((c & 0xf0) == 0xf0) {
  361. if (cur[3] == 0) {
  362. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  363. cur = ctxt->input->cur;
  364. }
  365. if (((c & 0xf8) != 0xf0) ||
  366. ((cur[3] & 0xc0) != 0x80))
  367. goto encoding_error;
  368. /* 4-byte code */
  369. *len = 4;
  370. val = (cur[0] & 0x7) << 18;
  371. val |= (cur[1] & 0x3f) << 12;
  372. val |= (cur[2] & 0x3f) << 6;
  373. val |= cur[3] & 0x3f;
  374. } else {
  375. /* 3-byte code */
  376. *len = 3;
  377. val = (cur[0] & 0xf) << 12;
  378. val |= (cur[1] & 0x3f) << 6;
  379. val |= cur[2] & 0x3f;
  380. }
  381. } else {
  382. /* 2-byte code */
  383. *len = 2;
  384. val = (cur[0] & 0x1f) << 6;
  385. val |= cur[1] & 0x3f;
  386. }
  387. if (!IS_CHAR(val)) {
  388. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  389. "Char 0x%X out of allowed range\n", val);
  390. }
  391. return(val);
  392. } else {
  393. if ((*ctxt->input->cur == 0) &&
  394. (ctxt->input->cur < ctxt->input->end)) {
  395. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  396. "Char 0x%X out of allowed range\n", 0);
  397. *len = 1;
  398. return(' ');
  399. }
  400. /* 1-byte code */
  401. *len = 1;
  402. return((int) *ctxt->input->cur);
  403. }
  404. }
  405. /*
  406. * Assume it's a fixed length encoding (1) with
  407. * a compatible encoding for the ASCII set, since
  408. * XML constructs only use < 128 chars
  409. */
  410. *len = 1;
  411. if ((int) *ctxt->input->cur < 0x80)
  412. return((int) *ctxt->input->cur);
  413. /*
  414. * Humm this is bad, do an automatic flow conversion
  415. */
  416. {
  417. xmlChar * guess;
  418. xmlCharEncodingHandlerPtr handler;
  419. guess = htmlFindEncoding(ctxt);
  420. if (guess == NULL) {
  421. xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
  422. } else {
  423. if (ctxt->input->encoding != NULL)
  424. xmlFree((xmlChar *) ctxt->input->encoding);
  425. ctxt->input->encoding = guess;
  426. handler = xmlFindCharEncodingHandler((const char *) guess);
  427. if (handler != NULL) {
  428. xmlSwitchToEncoding(ctxt, handler);
  429. } else {
  430. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  431. "Unsupported encoding %s", guess, NULL);
  432. }
  433. }
  434. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  435. }
  436. return(xmlCurrentChar(ctxt, len));
  437. encoding_error:
  438. /*
  439. * If we detect an UTF8 error that probably mean that the
  440. * input encoding didn't get properly advertized in the
  441. * declaration header. Report the error and switch the encoding
  442. * to ISO-Latin-1 (if you don't like this policy, just declare the
  443. * encoding !)
  444. */
  445. {
  446. char buffer[150];
  447. if (ctxt->input->end - ctxt->input->cur >= 4) {
  448. snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
  449. ctxt->input->cur[0], ctxt->input->cur[1],
  450. ctxt->input->cur[2], ctxt->input->cur[3]);
  451. } else {
  452. snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
  453. }
  454. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  455. "Input is not proper UTF-8, indicate encoding !\n",
  456. BAD_CAST buffer, NULL);
  457. }
  458. ctxt->charset = XML_CHAR_ENCODING_8859_1;
  459. *len = 1;
  460. return((int) *ctxt->input->cur);
  461. }
  462. /**
  463. * htmlSkipBlankChars:
  464. * @ctxt: the HTML parser context
  465. *
  466. * skip all blanks character found at that point in the input streams.
  467. *
  468. * Returns the number of space chars skipped
  469. */
  470. static int
  471. htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
  472. int res = 0;
  473. while (IS_BLANK_CH(*(ctxt->input->cur))) {
  474. if ((*ctxt->input->cur == 0) &&
  475. (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
  476. xmlPopInput(ctxt);
  477. } else {
  478. if (*(ctxt->input->cur) == '\n') {
  479. ctxt->input->line++; ctxt->input->col = 1;
  480. } else ctxt->input->col++;
  481. ctxt->input->cur++;
  482. ctxt->nbChars++;
  483. if (*ctxt->input->cur == 0)
  484. xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
  485. }
  486. res++;
  487. }
  488. return(res);
  489. }
  490. /************************************************************************
  491. * *
  492. * The list of HTML elements and their properties *
  493. * *
  494. ************************************************************************/
/*
 * Start Tag: 1 means the start tag can be omitted
 * End Tag:   1 means the end tag can be omitted
 *            2 means it's forbidden (empty elements)
 *            3 means the tag is stylistic and should be closed easily
 * Depr:      this element is deprecated
 * DTD:       1 means that this element is valid only in the Loose DTD
 *            2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */
  507. /* Definitions and a couple of vars for HTML Elements */
  508. #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
  509. #define NB_FONTSTYLE 8
  510. #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
  511. #define NB_PHRASE 10
  512. #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
  513. #define NB_SPECIAL 16
  514. #define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL
  515. #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
  516. #define BLOCK HEADING, LIST "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
  517. #define NB_BLOCK NB_HEADING + NB_LIST + 14
  518. #define FORMCTRL "input", "select", "textarea", "label", "button"
  519. #define NB_FORMCTRL 5
  520. #define PCDATA
  521. #define NB_PCDATA 0
  522. #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
  523. #define NB_HEADING 6
  524. #define LIST "ul", "ol", "dir", "menu"
  525. #define NB_LIST 4
  526. #define MODIFIER
  527. #define NB_MODIFIER 0
  528. #define FLOW BLOCK,INLINE
  529. #define NB_FLOW NB_BLOCK + NB_INLINE
  530. #define EMPTY NULL
  531. static const char* const html_flow[] = { FLOW, NULL } ;
  532. static const char* const html_inline[] = { INLINE, NULL } ;
  533. /* placeholders: elts with content but no subelements */
  534. static const char* const html_pcdata[] = { NULL } ;
  535. #define html_cdata html_pcdata
  536. /* ... and for HTML Attributes */
  537. #define COREATTRS "id", "class", "style", "title"
  538. #define NB_COREATTRS 4
  539. #define I18N "lang", "dir"
  540. #define NB_I18N 2
  541. #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
  542. #define NB_EVENTS 9
  543. #define ATTRS COREATTRS,I18N,EVENTS
  544. #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
  545. #define CELLHALIGN "align", "char", "charoff"
  546. #define NB_CELLHALIGN 3
  547. #define CELLVALIGN "valign"
  548. #define NB_CELLVALIGN 1
  549. static const char* const html_attrs[] = { ATTRS, NULL } ;
  550. static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
  551. static const char* const core_attrs[] = { COREATTRS, NULL } ;
  552. static const char* const i18n_attrs[] = { I18N, NULL } ;
  553. /* Other declarations that should go inline ... */
  554. static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
  555. "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
  556. "tabindex", "onfocus", "onblur", NULL } ;
  557. static const char* const target_attr[] = { "target", NULL } ;
  558. static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
  559. static const char* const alt_attr[] = { "alt", NULL } ;
  560. static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
  561. static const char* const href_attrs[] = { "href", NULL } ;
  562. static const char* const clear_attrs[] = { "clear", NULL } ;
  563. static const char* const inline_p[] = { INLINE, "p", NULL } ;
  564. static const char* const flow_param[] = { FLOW, "param", NULL } ;
  565. static const char* const applet_attrs[] = { COREATTRS , "codebase",
  566. "archive", "alt", "name", "height", "width", "align",
  567. "hspace", "vspace", NULL } ;
  568. static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
  569. "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
  570. static const char* const basefont_attrs[] =
  571. { "id", "size", "color", "face", NULL } ;
  572. static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
  573. static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
  574. static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
  575. static const char* const body_depr[] = { "background", "bgcolor", "text",
  576. "link", "vlink", "alink", NULL } ;
  577. static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
  578. "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
  579. static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
  580. static const char* const col_elt[] = { "col", NULL } ;
  581. static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
  582. static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
  583. static const char* const dl_contents[] = { "dt", "dd", NULL } ;
  584. static const char* const compact_attr[] = { "compact", NULL } ;
  585. static const char* const label_attr[] = { "label", NULL } ;
  586. static const char* const fieldset_contents[] = { FLOW, "legend" } ;
  587. static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
  588. static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
  589. static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
  590. static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
  591. static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
  592. static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
  593. static const char* const head_attrs[] = { I18N, "profile", NULL } ;
  594. static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
  595. static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
  596. static const char* const version_attr[] = { "version", NULL } ;
  597. static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
  598. static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
  599. static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
  600. static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
  601. static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
  602. static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
  603. static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
  604. static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
  605. static const char* const align_attr[] = { "align", NULL } ;
  606. static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
  607. static const char* const map_contents[] = { BLOCK, "area", NULL } ;
  608. static const char* const name_attr[] = { "name", NULL } ;
  609. static const char* const action_attr[] = { "action", NULL } ;
  610. static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
  611. static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
  612. static const char* const content_attr[] = { "content", NULL } ;
  613. static const char* const type_attr[] = { "type", NULL } ;
  614. static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
  615. static const char* const object_contents[] = { FLOW, "param", NULL } ;
  616. static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
  617. static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
  618. static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
  619. static const char* const option_elt[] = { "option", NULL } ;
  620. static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
  621. static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
  622. static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
  623. static const char* const width_attr[] = { "width", NULL } ;
  624. static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
  625. static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
  626. static const char* const language_attr[] = { "language", NULL } ;
  627. static const char* const select_content[] = { "optgroup", "option", NULL } ;
  628. static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
  629. static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
  630. static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
  631. static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
  632. static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
  633. static const char* const tr_elt[] = { "tr", NULL } ;
  634. static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
  635. static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
  636. static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
  637. static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
  638. static const char* const tr_contents[] = { "th", "td", NULL } ;
  639. static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
  640. static const char* const li_elt[] = { "li", NULL } ;
  641. static const char* const ul_depr[] = { "type", "compact", NULL} ;
  642. static const char* const dir_attr[] = { "dir", NULL} ;
  643. #define DECL (const char**)
  644. static const htmlElemDesc
  645. html40ElementTable[] = {
  646. { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
  647. DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
  648. },
  649. { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
  650. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  651. },
  652. { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
  653. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  654. },
  655. { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
  656. DECL inline_p , NULL , DECL html_attrs, NULL, NULL
  657. },
  658. { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
  659. DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
  660. },
  661. { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
  662. EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
  663. },
  664. { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
  665. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  666. },
  667. { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
  668. EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
  669. },
  670. { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
  671. EMPTY , NULL , NULL, DECL basefont_attrs, NULL
  672. },
  673. { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
  674. DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
  675. },
  676. { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
  677. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  678. },
  679. { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
  680. DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
  681. },
  682. { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
  683. DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
  684. },
  685. { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
  686. EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
  687. },
  688. { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
  689. DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
  690. },
  691. { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
  692. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  693. },
  694. { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
  695. DECL html_flow , NULL , NULL, DECL html_attrs, NULL
  696. },
  697. { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
  698. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  699. },
  700. { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
  701. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  702. },
  703. { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
  704. EMPTY , NULL , DECL col_attrs , NULL, NULL
  705. },
  706. { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
  707. DECL col_elt , "col" , DECL col_attrs , NULL, NULL
  708. },
  709. { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
  710. DECL html_flow , NULL , DECL html_attrs, NULL, NULL
  711. },
  712. { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
  713. DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
  714. },
  715. { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
  716. DECL html_inline , NULL , DECL html_attrs, NULL, NULL
  717. },
  718. { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
  719. DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
  720. },
  721. { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
  722. DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
  723. },
  724. { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
  725. DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
  726. },
  727. { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
  728. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  729. },
  730. { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
  731. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  732. },
  733. { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
  734. EMPTY, NULL, DECL embed_attrs, NULL, NULL
  735. },
  736. { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
  737. DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
  738. },
  739. { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
  740. DECL html_inline, NULL, NULL, DECL font_attrs, NULL
  741. },
  742. { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
  743. DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
  744. },
  745. { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
  746. EMPTY, NULL, NULL, DECL frame_attrs, NULL
  747. },
  748. { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
  749. DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
  750. },
  751. { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
  752. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  753. },
  754. { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
  755. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  756. },
  757. { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
  758. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  759. },
  760. { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
  761. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  762. },
  763. { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
  764. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  765. },
  766. { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
  767. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  768. },
  769. { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
  770. DECL head_contents, NULL, DECL head_attrs, NULL, NULL
  771. },
  772. { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
  773. EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
  774. },
  775. { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
  776. DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
  777. },
  778. { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
  779. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  780. },
  781. { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
  782. DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
  783. },
  784. { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
  785. EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
  786. },
  787. { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
  788. EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
  789. },
  790. { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
  791. DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
  792. },
  793. { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
  794. EMPTY, NULL, NULL, DECL prompt_attrs, NULL
  795. },
  796. { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
  797. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  798. },
  799. { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
  800. DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
  801. },
  802. { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
  803. DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
  804. },
  805. { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
  806. DECL html_flow, NULL, DECL html_attrs, NULL, NULL
  807. },
  808. { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
  809. EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
  810. },
  811. { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
  812. DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
  813. },
  814. { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
  815. DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
  816. },
  817. { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
  818. EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
  819. },
  820. { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
  821. DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
  822. },
  823. { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
  824. DECL html_flow, "div", DECL html_attrs, NULL, NULL
  825. },
  826. { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
  827. DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
  828. },
  829. { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
  830. DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
  831. },
  832. { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
  833. DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
  834. },
  835. { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
  836. DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
  837. },
  838. { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
  839. DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
  840. },
  841. { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
  842. EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
  843. },
  844. { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
  845. DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
  846. },
  847. { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
  848. DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
  849. },
  850. { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
  851. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  852. },
  853. { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
  854. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  855. },
  856. { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
  857. DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
  858. },
  859. { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
  860. DECL select_content, NULL, DECL select_attrs, NULL, NULL
  861. },
  862. { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
  863. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  864. },
  865. { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
  866. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  867. },
  868. { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
  869. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  870. },
  871. { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
  872. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  873. },
  874. { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
  875. DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
  876. },
  877. { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
  878. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  879. },
  880. { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
  881. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  882. },
  883. { "table", 0, 0, 0, 0, 0, 0, 0, "",
  884. DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
  885. },
  886. { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
  887. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  888. },
  889. { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
  890. DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
  891. },
  892. { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
  893. DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
  894. },
  895. { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
  896. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  897. },
  898. { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
  899. DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
  900. },
  901. { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
  902. DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
  903. },
  904. { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
  905. DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
  906. },
  907. { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
  908. DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
  909. },
  910. { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
  911. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  912. },
  913. { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
  914. DECL html_inline, NULL, NULL, DECL html_attrs, NULL
  915. },
  916. { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
  917. DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
  918. },
  919. { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
  920. DECL html_inline, NULL, DECL html_attrs, NULL, NULL
  921. }
  922. };
  923. /*
  924. * start tags that imply the end of current element
  925. */
  926. static const char * const htmlStartClose[] = {
  927. "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
  928. "dl", "ul", "ol", "menu", "dir", "address", "pre",
  929. "listing", "xmp", "head", NULL,
  930. "head", "p", NULL,
  931. "title", "p", NULL,
  932. "body", "head", "style", "link", "title", "p", NULL,
  933. "frameset", "head", "style", "link", "title", "p", NULL,
  934. "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
  935. "pre", "listing", "xmp", "head", "li", NULL,
  936. "hr", "p", "head", NULL,
  937. "h1", "p", "head", NULL,
  938. "h2", "p", "head", NULL,
  939. "h3", "p", "head", NULL,
  940. "h4", "p", "head", NULL,
  941. "h5", "p", "head", NULL,
  942. "h6", "p", "head", NULL,
  943. "dir", "p", "head", NULL,
  944. "address", "p", "head", "ul", NULL,
  945. "pre", "p", "head", "ul", NULL,
  946. "listing", "p", "head", NULL,
  947. "xmp", "p", "head", NULL,
  948. "blockquote", "p", "head", NULL,
  949. "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
  950. "xmp", "head", NULL,
  951. "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
  952. "head", "dd", NULL,
  953. "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
  954. "head", "dt", NULL,
  955. "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
  956. "listing", "xmp", NULL,
  957. "ol", "p", "head", "ul", NULL,
  958. "menu", "p", "head", "ul", NULL,
  959. "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
  960. "div", "p", "head", NULL,
  961. "noscript", "p", "head", NULL,
  962. "center", "font", "b", "i", "p", "head", NULL,
  963. "a", "a", NULL,
  964. "caption", "p", NULL,
  965. "colgroup", "caption", "colgroup", "col", "p", NULL,
  966. "col", "caption", "col", "p", NULL,
  967. "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
  968. "listing", "xmp", "a", NULL,
  969. "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
  970. "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
  971. "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
  972. "thead", "caption", "col", "colgroup", NULL,
  973. "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
  974. "tbody", "p", NULL,
  975. "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
  976. "tfoot", "tbody", "p", NULL,
  977. "optgroup", "option", NULL,
  978. "option", "option", NULL,
  979. "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
  980. "pre", "listing", "xmp", "a", NULL,
  981. NULL
  982. };
  /*
   * NULL-terminated list of the HTML elements which are not supposed to
   * carry character data directly. htmlCheckParagraph() consults it: when
   * the current element is one of these, a p element is implied before
   * characters are inserted.
   *
   * TODO: extend that list by reading the HTML SGML DTD on
   *       implied paragraph
   */
  static const char *const htmlNoContentElements[] = {
      "html", "head", NULL
  };
  /*
   * The list of HTML attributes which are of content %Script; these are
   * the intrinsic event attributes of HTML 4.
   *
   * The array has no terminator: users iterate it with
   * sizeof(htmlScriptAttributes) / sizeof(htmlScriptAttributes[0]).
   *
   * NOTE: when adding ones, check htmlIsScriptAttribute() since
   *       it assumes the name starts with 'on'
   */
  static const char *const htmlScriptAttributes[] = {
      /* pointer events */
      "onclick",
      "ondblclick",
      "onmousedown",
      "onmouseup",
      "onmouseover",
      "onmousemove",
      "onmouseout",
      /* keyboard events */
      "onkeypress",
      "onkeydown",
      "onkeyup",
      /* document and form events */
      "onload",
      "onunload",
      "onfocus",
      "onblur",
      "onsubmit",
      "onreset",      /* form reset event; the name is "onreset" in HTML 4 */
      "onchange",
      "onselect"
  };
  /*
   * This table is used by the htmlparser to know what to do with
   * broken html pages. Each element gets a priority, which lets the
   * parser decide how to handle extra endtags: an endtag is only
   * allowed to close elements with lower or equal priority.
   */
  typedef struct {
      const char *name;       /* element name, NULL in the sentinel entry */
      int priority;           /* "endtag" priority of that element */
  } elementPriority;

  /*
   * The entry with a NULL name terminates the table; its priority is
   * what htmlGetEndPriority() returns for every element not listed here.
   */
  static const elementPriority htmlEndPriority[] = {
      { "div",   150 },
      { "td",    160 },
      { "th",    160 },
      { "tr",    170 },
      { "thead", 180 },
      { "tbody", 180 },
      { "tfoot", 180 },
      { "table", 190 },
      { "head",  200 },
      { "body",  200 },
      { "html",  220 },
      { NULL,    100 }        /* Default priority */
  };
  /*
   * Index over htmlStartClose filled in by htmlInitAutoClose(): every
   * used slot points at the first string of one group of that table,
   * unused slots are NULL.
   */
  static const char **htmlStartCloseIndex[100];
  /* set to 1 by htmlInitAutoClose() once the index above has been built */
  static int htmlStartCloseIndexinitialized = 0;
  1047. /************************************************************************
  1048. * *
  1049. * functions to handle HTML specific data *
  1050. * *
  1051. ************************************************************************/
  1052. /**
  1053. * htmlInitAutoClose:
  1054. *
  1055. * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
  1056. * This is not reentrant. Call xmlInitParser() once before processing in
  1057. * case of use in multithreaded programs.
  1058. */
  1059. void
  1060. htmlInitAutoClose(void) {
  1061. int indx, i = 0;
  1062. if (htmlStartCloseIndexinitialized) return;
  1063. for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
  1064. indx = 0;
  1065. while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
  1066. htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
  1067. while (htmlStartClose[i] != NULL) i++;
  1068. i++;
  1069. }
  1070. htmlStartCloseIndexinitialized = 1;
  1071. }
  1072. /**
  1073. * htmlTagLookup:
  1074. * @tag: The tag name in lowercase
  1075. *
  1076. * Lookup the HTML tag in the ElementTable
  1077. *
  1078. * Returns the related htmlElemDescPtr or NULL if not found.
  1079. */
  1080. const htmlElemDesc *
  1081. htmlTagLookup(const xmlChar *tag) {
  1082. unsigned int i;
  1083. for (i = 0; i < (sizeof(html40ElementTable) /
  1084. sizeof(html40ElementTable[0]));i++) {
  1085. if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
  1086. return((htmlElemDescPtr) &html40ElementTable[i]);
  1087. }
  1088. return(NULL);
  1089. }
  1090. /**
  1091. * htmlGetEndPriority:
  1092. * @name: The name of the element to look up the priority for.
  1093. *
  1094. * Return value: The "endtag" priority.
  1095. **/
  1096. static int
  1097. htmlGetEndPriority (const xmlChar *name) {
  1098. int i = 0;
  1099. while ((htmlEndPriority[i].name != NULL) &&
  1100. (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
  1101. i++;
  1102. return(htmlEndPriority[i].priority);
  1103. }
  1104. /**
  1105. * htmlCheckAutoClose:
  1106. * @newtag: The new tag name
  1107. * @oldtag: The old tag name
  1108. *
  1109. * Checks whether the new tag is one of the registered valid tags for
  1110. * closing old.
  1111. * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
  1112. *
  1113. * Returns 0 if no, 1 if yes.
  1114. */
  1115. static int
  1116. htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
  1117. {
  1118. int i, indx;
  1119. const char **closed = NULL;
  1120. if (htmlStartCloseIndexinitialized == 0)
  1121. htmlInitAutoClose();
  1122. /* inefficient, but not a big deal */
  1123. for (indx = 0; indx < 100; indx++) {
  1124. closed = htmlStartCloseIndex[indx];
  1125. if (closed == NULL)
  1126. return (0);
  1127. if (xmlStrEqual(BAD_CAST * closed, newtag))
  1128. break;
  1129. }
  1130. i = closed - htmlStartClose;
  1131. i++;
  1132. while (htmlStartClose[i] != NULL) {
  1133. if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
  1134. return (1);
  1135. }
  1136. i++;
  1137. }
  1138. return (0);
  1139. }
  1140. /**
  1141. * htmlAutoCloseOnClose:
  1142. * @ctxt: an HTML parser context
  1143. * @newtag: The new tag name
  1144. * @force: force the tag closure
  1145. *
  1146. * The HTML DTD allows an ending tag to implicitly close other tags.
  1147. */
  1148. static void
  1149. htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
  1150. {
  1151. const htmlElemDesc *info;
  1152. int i, priority;
  1153. priority = htmlGetEndPriority(newtag);
  1154. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  1155. if (xmlStrEqual(newtag, ctxt->nameTab[i]))
  1156. break;
  1157. /*
  1158. * A missplaced endtag can only close elements with lower
  1159. * or equal priority, so if we find an element with higher
  1160. * priority before we find an element with
  1161. * matching name, we just ignore this endtag
  1162. */
  1163. if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
  1164. return;
  1165. }
  1166. if (i < 0)
  1167. return;
  1168. while (!xmlStrEqual(newtag, ctxt->name)) {
  1169. info = htmlTagLookup(ctxt->name);
  1170. if ((info != NULL) && (info->endTag == 3)) {
  1171. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  1172. "Opening and ending tag mismatch: %s and %s\n",
  1173. newtag, ctxt->name);
  1174. }
  1175. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1176. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1177. htmlnamePop(ctxt);
  1178. }
  1179. }
  1180. /**
  1181. * htmlAutoCloseOnEnd:
  1182. * @ctxt: an HTML parser context
  1183. *
  1184. * Close all remaining tags at the end of the stream
  1185. */
  1186. static void
  1187. htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
  1188. {
  1189. int i;
  1190. if (ctxt->nameNr == 0)
  1191. return;
  1192. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  1193. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1194. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1195. htmlnamePop(ctxt);
  1196. }
  1197. }
  1198. /**
  1199. * htmlAutoClose:
  1200. * @ctxt: an HTML parser context
  1201. * @newtag: The new tag name or NULL
  1202. *
  1203. * The HTML DTD allows a tag to implicitly close other tags.
  1204. * The list is kept in htmlStartClose array. This function is
  1205. * called when a new tag has been detected and generates the
  1206. * appropriates closes if possible/needed.
  1207. * If newtag is NULL this mean we are at the end of the resource
  1208. * and we should check
  1209. */
  1210. static void
  1211. htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
  1212. {
  1213. while ((newtag != NULL) && (ctxt->name != NULL) &&
  1214. (htmlCheckAutoClose(newtag, ctxt->name))) {
  1215. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1216. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1217. htmlnamePop(ctxt);
  1218. }
  1219. if (newtag == NULL) {
  1220. htmlAutoCloseOnEnd(ctxt);
  1221. return;
  1222. }
  1223. while ((newtag == NULL) && (ctxt->name != NULL) &&
  1224. ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
  1225. (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
  1226. (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
  1227. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  1228. ctxt->sax->endElement(ctxt->userData, ctxt->name);
  1229. htmlnamePop(ctxt);
  1230. }
  1231. }
  1232. /**
  1233. * htmlAutoCloseTag:
  1234. * @doc: the HTML document
  1235. * @name: The tag name
  1236. * @elem: the HTML element
  1237. *
  1238. * The HTML DTD allows a tag to implicitly close other tags.
  1239. * The list is kept in htmlStartClose array. This function checks
  1240. * if the element or one of it's children would autoclose the
  1241. * given tag.
  1242. *
  1243. * Returns 1 if autoclose, 0 otherwise
  1244. */
  1245. int
  1246. htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
  1247. htmlNodePtr child;
  1248. if (elem == NULL) return(1);
  1249. if (xmlStrEqual(name, elem->name)) return(0);
  1250. if (htmlCheckAutoClose(elem->name, name)) return(1);
  1251. child = elem->children;
  1252. while (child != NULL) {
  1253. if (htmlAutoCloseTag(doc, name, child)) return(1);
  1254. child = child->next;
  1255. }
  1256. return(0);
  1257. }
  1258. /**
  1259. * htmlIsAutoClosed:
  1260. * @doc: the HTML document
  1261. * @elem: the HTML element
  1262. *
  1263. * The HTML DTD allows a tag to implicitly close other tags.
  1264. * The list is kept in htmlStartClose array. This function checks
  1265. * if a tag is autoclosed by one of it's child
  1266. *
  1267. * Returns 1 if autoclosed, 0 otherwise
  1268. */
  1269. int
  1270. htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
  1271. htmlNodePtr child;
  1272. if (elem == NULL) return(1);
  1273. child = elem->children;
  1274. while (child != NULL) {
  1275. if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
  1276. child = child->next;
  1277. }
  1278. return(0);
  1279. }
  1280. /**
  1281. * htmlCheckImplied:
  1282. * @ctxt: an HTML parser context
  1283. * @newtag: The new tag name
  1284. *
  1285. * The HTML DTD allows a tag to exists only implicitly
  1286. * called when a new tag has been detected and generates the
  1287. * appropriates implicit tags if missing
  1288. */
  1289. static void
  1290. htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
  1291. int i;
  1292. if (!htmlOmittedDefaultValue)
  1293. return;
  1294. if (xmlStrEqual(newtag, BAD_CAST"html"))
  1295. return;
  1296. if (ctxt->nameNr <= 0) {
  1297. htmlnamePush(ctxt, BAD_CAST"html");
  1298. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1299. ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
  1300. }
  1301. if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
  1302. return;
  1303. if ((ctxt->nameNr <= 1) &&
  1304. ((xmlStrEqual(newtag, BAD_CAST"script")) ||
  1305. (xmlStrEqual(newtag, BAD_CAST"style")) ||
  1306. (xmlStrEqual(newtag, BAD_CAST"meta")) ||
  1307. (xmlStrEqual(newtag, BAD_CAST"link")) ||
  1308. (xmlStrEqual(newtag, BAD_CAST"title")) ||
  1309. (xmlStrEqual(newtag, BAD_CAST"base")))) {
  1310. if (ctxt->html >= 3) {
  1311. /* we already saw or generated an <head> before */
  1312. return;
  1313. }
  1314. /*
  1315. * dropped OBJECT ... i you put it first BODY will be
  1316. * assumed !
  1317. */
  1318. htmlnamePush(ctxt, BAD_CAST"head");
  1319. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1320. ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
  1321. } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
  1322. (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
  1323. (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
  1324. if (ctxt->html >= 10) {
  1325. /* we already saw or generated a <body> before */
  1326. return;
  1327. }
  1328. for (i = 0;i < ctxt->nameNr;i++) {
  1329. if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
  1330. return;
  1331. }
  1332. if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
  1333. return;
  1334. }
  1335. }
  1336. htmlnamePush(ctxt, BAD_CAST"body");
  1337. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1338. ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
  1339. }
  1340. }
  1341. /**
  1342. * htmlCheckParagraph
  1343. * @ctxt: an HTML parser context
  1344. *
  1345. * Check whether a p element need to be implied before inserting
  1346. * characters in the current element.
  1347. *
  1348. * Returns 1 if a paragraph has been inserted, 0 if not and -1
  1349. * in case of error.
  1350. */
  1351. static int
  1352. htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
  1353. const xmlChar *tag;
  1354. int i;
  1355. if (ctxt == NULL)
  1356. return(-1);
  1357. tag = ctxt->name;
  1358. if (tag == NULL) {
  1359. htmlAutoClose(ctxt, BAD_CAST"p");
  1360. htmlCheckImplied(ctxt, BAD_CAST"p");
  1361. htmlnamePush(ctxt, BAD_CAST"p");
  1362. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1363. ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
  1364. return(1);
  1365. }
  1366. if (!htmlOmittedDefaultValue)
  1367. return(0);
  1368. for (i = 0; htmlNoContentElements[i] != NULL; i++) {
  1369. if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
  1370. htmlAutoClose(ctxt, BAD_CAST"p");
  1371. htmlCheckImplied(ctxt, BAD_CAST"p");
  1372. htmlnamePush(ctxt, BAD_CAST"p");
  1373. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
  1374. ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
  1375. return(1);
  1376. }
  1377. }
  1378. return(0);
  1379. }
  1380. /**
  1381. * htmlIsScriptAttribute:
  1382. * @name: an attribute name
  1383. *
  1384. * Check if an attribute is of content type Script
  1385. *
  1386. * Returns 1 is the attribute is a script 0 otherwise
  1387. */
  1388. int
  1389. htmlIsScriptAttribute(const xmlChar *name) {
  1390. unsigned int i;
  1391. if (name == NULL)
  1392. return(0);
  1393. /*
  1394. * all script attributes start with 'on'
  1395. */
  1396. if ((name[0] != 'o') || (name[1] != 'n'))
  1397. return(0);
  1398. for (i = 0;
  1399. i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
  1400. i++) {
  1401. if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
  1402. return(1);
  1403. }
  1404. return(0);
  1405. }
  1406. /************************************************************************
  1407. * *
  1408. * The list of HTML predefined entities *
  1409. * *
  1410. ************************************************************************/
  1411. static const htmlEntityDesc html40EntitiesTable[] = {
  1412. /*
  1413. * the 4 absolute ones, plus apostrophe.
  1414. */
  1415. { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
  1416. { 38, "amp", "ampersand, U+0026 ISOnum" },
  1417. { 39, "apos", "single quote" },
  1418. { 60, "lt", "less-than sign, U+003C ISOnum" },
  1419. { 62, "gt", "greater-than sign, U+003E ISOnum" },
  1420. /*
  1421. * A bunch still in the 128-255 range
  1422. * Replacing them depend really on the charset used.
  1423. */
  1424. { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
  1425. { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
  1426. { 162, "cent", "cent sign, U+00A2 ISOnum" },
  1427. { 163, "pound","pound sign, U+00A3 ISOnum" },
  1428. { 164, "curren","currency sign, U+00A4 ISOnum" },
  1429. { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
  1430. { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
  1431. { 167, "sect", "section sign, U+00A7 ISOnum" },
  1432. { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
  1433. { 169, "copy", "copyright sign, U+00A9 ISOnum" },
  1434. { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
  1435. { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
  1436. { 172, "not", "not sign, U+00AC ISOnum" },
  1437. { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
  1438. { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
  1439. { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
  1440. { 176, "deg", "degree sign, U+00B0 ISOnum" },
  1441. { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
  1442. { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
  1443. { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
  1444. { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
  1445. { 181, "micro","micro sign, U+00B5 ISOnum" },
  1446. { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
  1447. { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
  1448. { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
  1449. { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
  1450. { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
  1451. { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
  1452. { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
  1453. { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
  1454. { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
  1455. { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
  1456. { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
  1457. { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
  1458. { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
  1459. { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
  1460. { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
  1461. { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
  1462. { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
  1463. { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
  1464. { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
  1465. { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
  1466. { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
  1467. { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
  1468. { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
  1469. { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
  1470. { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
  1471. { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
  1472. { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
  1473. { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
  1474. { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
  1475. { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
  1476. { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
  1477. { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
  1478. { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
  1479. { 215, "times","multiplication sign, U+00D7 ISOnum" },
  1480. { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
  1481. { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
  1482. { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
  1483. { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
  1484. { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
  1485. { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
  1486. { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
  1487. { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
  1488. { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
  1489. { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
  1490. { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
  1491. { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
  1492. { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
  1493. { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
  1494. { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
  1495. { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
  1496. { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
  1497. { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
  1498. { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
  1499. { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
  1500. { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
  1501. { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
  1502. { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
  1503. { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
  1504. { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
  1505. { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
  1506. { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
  1507. { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
  1508. { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
  1509. { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
  1510. { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
  1511. { 247, "divide","division sign, U+00F7 ISOnum" },
  1512. { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
  1513. { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
  1514. { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
  1515. { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
  1516. { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
  1517. { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
  1518. { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
  1519. { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
  1520. { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
  1521. { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
  1522. { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
  1523. { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
  1524. { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
  1525. /*
  1526. * Anything below should really be kept as entities references
  1527. */
  1528. { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
  1529. { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
  1530. { 732, "tilde","small tilde, U+02DC ISOdia" },
  1531. { 913, "Alpha","greek capital letter alpha, U+0391" },
  1532. { 914, "Beta", "greek capital letter beta, U+0392" },
  1533. { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
  1534. { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
  1535. { 917, "Epsilon","greek capital letter epsilon, U+0395" },
  1536. { 918, "Zeta", "greek capital letter zeta, U+0396" },
  1537. { 919, "Eta", "greek capital letter eta, U+0397" },
  1538. { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
  1539. { 921, "Iota", "greek capital letter iota, U+0399" },
  1540. { 922, "Kappa","greek capital letter kappa, U+039A" },
  1541. { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
  1542. { 924, "Mu", "greek capital letter mu, U+039C" },
  1543. { 925, "Nu", "greek capital letter nu, U+039D" },
  1544. { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
  1545. { 927, "Omicron","greek capital letter omicron, U+039F" },
  1546. { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
  1547. { 929, "Rho", "greek capital letter rho, U+03A1" },
  1548. { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
  1549. { 932, "Tau", "greek capital letter tau, U+03A4" },
  1550. { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
  1551. { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
  1552. { 935, "Chi", "greek capital letter chi, U+03A7" },
  1553. { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
  1554. { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
  1555. { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
  1556. { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
  1557. { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
  1558. { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
  1559. { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
  1560. { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
  1561. { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
  1562. { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
  1563. { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
  1564. { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
  1565. { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
  1566. { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
  1567. { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
  1568. { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
  1569. { 959, "omicron","greek small letter omicron, U+03BF NEW" },
  1570. { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
  1571. { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
  1572. { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
  1573. { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
  1574. { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
  1575. { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
  1576. { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
  1577. { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
  1578. { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
  1579. { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
  1580. { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
  1581. { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
  1582. { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
  1583. { 8194, "ensp", "en space, U+2002 ISOpub" },
  1584. { 8195, "emsp", "em space, U+2003 ISOpub" },
  1585. { 8201, "thinsp","thin space, U+2009 ISOpub" },
  1586. { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
  1587. { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
  1588. { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
  1589. { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
  1590. { 8211, "ndash","en dash, U+2013 ISOpub" },
  1591. { 8212, "mdash","em dash, U+2014 ISOpub" },
  1592. { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
  1593. { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
  1594. { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
  1595. { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
  1596. { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
  1597. { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
  1598. { 8224, "dagger","dagger, U+2020 ISOpub" },
  1599. { 8225, "Dagger","double dagger, U+2021 ISOpub" },
  1600. { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
  1601. { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
  1602. { 8240, "permil","per mille sign, U+2030 ISOtech" },
  1603. { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
  1604. { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
  1605. { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
  1606. { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
  1607. { 8254, "oline","overline = spacing overscore, U+203E NEW" },
  1608. { 8260, "frasl","fraction slash, U+2044 NEW" },
  1609. { 8364, "euro", "euro sign, U+20AC NEW" },
  1610. { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
  1611. { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
  1612. { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
  1613. { 8482, "trade","trade mark sign, U+2122 ISOnum" },
  1614. { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
  1615. { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
  1616. { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
  1617. { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
  1618. { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
  1619. { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
  1620. { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
  1621. { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
  1622. { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
  1623. { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
  1624. { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
  1625. { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
  1626. { 8704, "forall","for all, U+2200 ISOtech" },
  1627. { 8706, "part", "partial differential, U+2202 ISOtech" },
  1628. { 8707, "exist","there exists, U+2203 ISOtech" },
  1629. { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
  1630. { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
  1631. { 8712, "isin", "element of, U+2208 ISOtech" },
  1632. { 8713, "notin","not an element of, U+2209 ISOtech" },
  1633. { 8715, "ni", "contains as member, U+220B ISOtech" },
  1634. { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
  1635. { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
  1636. { 8722, "minus","minus sign, U+2212 ISOtech" },
  1637. { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
  1638. { 8730, "radic","square root = radical sign, U+221A ISOtech" },
  1639. { 8733, "prop", "proportional to, U+221D ISOtech" },
  1640. { 8734, "infin","infinity, U+221E ISOtech" },
  1641. { 8736, "ang", "angle, U+2220 ISOamso" },
  1642. { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
  1643. { 8744, "or", "logical or = vee, U+2228 ISOtech" },
  1644. { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
  1645. { 8746, "cup", "union = cup, U+222A ISOtech" },
  1646. { 8747, "int", "integral, U+222B ISOtech" },
  1647. { 8756, "there4","therefore, U+2234 ISOtech" },
  1648. { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
  1649. { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
  1650. { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
  1651. { 8800, "ne", "not equal to, U+2260 ISOtech" },
  1652. { 8801, "equiv","identical to, U+2261 ISOtech" },
  1653. { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
  1654. { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
  1655. { 8834, "sub", "subset of, U+2282 ISOtech" },
  1656. { 8835, "sup", "superset of, U+2283 ISOtech" },
  1657. { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
  1658. { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
  1659. { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
  1660. { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
  1661. { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
  1662. { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
  1663. { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
  1664. { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
  1665. { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
  1666. { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
  1667. { 8971, "rfloor","right floor, U+230B ISOamsc" },
  1668. { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
  1669. { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
  1670. { 9674, "loz", "lozenge, U+25CA ISOpub" },
  1671. { 9824, "spades","black spade suit, U+2660 ISOpub" },
  1672. { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
  1673. { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
  1674. { 9830, "diams","black diamond suit, U+2666 ISOpub" },
  1675. };
  1676. /************************************************************************
  1677. * *
  1678. * Commodity functions to handle entities *
  1679. * *
  1680. ************************************************************************/
  1681. /*
  1682. * Macro used to grow the current buffer.
  1683. */
  1684. #define growBuffer(buffer) { \
  1685. xmlChar *tmp; \
  1686. buffer##_size *= 2; \
  1687. tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
  1688. if (tmp == NULL) { \
  1689. htmlErrMemory(ctxt, "growing buffer\n"); \
  1690. xmlFree(buffer); \
  1691. return(NULL); \
  1692. } \
  1693. buffer = tmp; \
  1694. }
  1695. /**
  1696. * htmlEntityLookup:
  1697. * @name: the entity name
  1698. *
  1699. * Lookup the given entity in EntitiesTable
  1700. *
  1701. * TODO: the linear scan is really ugly, an hash table is really needed.
  1702. *
  1703. * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
  1704. */
  1705. const htmlEntityDesc *
  1706. htmlEntityLookup(const xmlChar *name) {
  1707. unsigned int i;
  1708. for (i = 0;i < (sizeof(html40EntitiesTable)/
  1709. sizeof(html40EntitiesTable[0]));i++) {
  1710. if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
  1711. return((htmlEntityDescPtr) &html40EntitiesTable[i]);
  1712. }
  1713. }
  1714. return(NULL);
  1715. }
  1716. /**
  1717. * htmlEntityValueLookup:
  1718. * @value: the entity's unicode value
  1719. *
  1720. * Lookup the given entity in EntitiesTable
  1721. *
  1722. * TODO: the linear scan is really ugly, an hash table is really needed.
  1723. *
  1724. * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
  1725. */
  1726. const htmlEntityDesc *
  1727. htmlEntityValueLookup(unsigned int value) {
  1728. unsigned int i;
  1729. for (i = 0;i < (sizeof(html40EntitiesTable)/
  1730. sizeof(html40EntitiesTable[0]));i++) {
  1731. if (html40EntitiesTable[i].value >= value) {
  1732. if (html40EntitiesTable[i].value > value)
  1733. break;
  1734. return((htmlEntityDescPtr) &html40EntitiesTable[i]);
  1735. }
  1736. }
  1737. return(NULL);
  1738. }
  1739. /**
  1740. * UTF8ToHtml:
  1741. * @out: a pointer to an array of bytes to store the result
  1742. * @outlen: the length of @out
  1743. * @in: a pointer to an array of UTF-8 chars
  1744. * @inlen: the length of @in
  1745. *
  1746. * Take a block of UTF-8 chars in and try to convert it to an ASCII
  1747. * plus HTML entities block of chars out.
  1748. *
  1749. * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
  1750. * The value of @inlen after return is the number of octets consumed
  1751. * as the return value is positive, else unpredictable.
  1752. * The value of @outlen after return is the number of octets consumed.
  1753. */
  1754. int
  1755. UTF8ToHtml(unsigned char* out, int *outlen,
  1756. const unsigned char* in, int *inlen) {
  1757. const unsigned char* processed = in;
  1758. const unsigned char* outend;
  1759. const unsigned char* outstart = out;
  1760. const unsigned char* instart = in;
  1761. const unsigned char* inend;
  1762. unsigned int c, d;
  1763. int trailing;
  1764. if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
  1765. if (in == NULL) {
  1766. /*
  1767. * initialization nothing to do
  1768. */
  1769. *outlen = 0;
  1770. *inlen = 0;
  1771. return(0);
  1772. }
  1773. inend = in + (*inlen);
  1774. outend = out + (*outlen);
  1775. while (in < inend) {
  1776. d = *in++;
  1777. if (d < 0x80) { c= d; trailing= 0; }
  1778. else if (d < 0xC0) {
  1779. /* trailing byte in leading position */
  1780. *outlen = out - outstart;
  1781. *inlen = processed - instart;
  1782. return(-2);
  1783. } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
  1784. else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
  1785. else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
  1786. else {
  1787. /* no chance for this in Ascii */
  1788. *outlen = out - outstart;
  1789. *inlen = processed - instart;
  1790. return(-2);
  1791. }
  1792. if (inend - in < trailing) {
  1793. break;
  1794. }
  1795. for ( ; trailing; trailing--) {
  1796. if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
  1797. break;
  1798. c <<= 6;
  1799. c |= d & 0x3F;
  1800. }
  1801. /* assertion: c is a single UTF-4 value */
  1802. if (c < 0x80) {
  1803. if (out + 1 >= outend)
  1804. break;
  1805. *out++ = c;
  1806. } else {
  1807. int len;
  1808. const htmlEntityDesc * ent;
  1809. const char *cp;
  1810. char nbuf[16];
  1811. /*
  1812. * Try to lookup a predefined HTML entity for it
  1813. */
  1814. ent = htmlEntityValueLookup(c);
  1815. if (ent == NULL) {
  1816. snprintf(nbuf, sizeof(nbuf), "#%u", c);
  1817. cp = nbuf;
  1818. }
  1819. else
  1820. cp = ent->name;
  1821. len = strlen(cp);
  1822. if (out + 2 + len >= outend)
  1823. break;
  1824. *out++ = '&';
  1825. memcpy(out, cp, len);
  1826. out += len;
  1827. *out++ = ';';
  1828. }
  1829. processed = in;
  1830. }
  1831. *outlen = out - outstart;
  1832. *inlen = processed - instart;
  1833. return(0);
  1834. }
  1835. /**
  1836. * htmlEncodeEntities:
  1837. * @out: a pointer to an array of bytes to store the result
  1838. * @outlen: the length of @out
  1839. * @in: a pointer to an array of UTF-8 chars
  1840. * @inlen: the length of @in
  1841. * @quoteChar: the quote character to escape (' or ") or zero.
  1842. *
  1843. * Take a block of UTF-8 chars in and try to convert it to an ASCII
  1844. * plus HTML entities block of chars out.
  1845. *
  1846. * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
  1847. * The value of @inlen after return is the number of octets consumed
  1848. * as the return value is positive, else unpredictable.
  1849. * The value of @outlen after return is the number of octets consumed.
  1850. */
  1851. int
  1852. htmlEncodeEntities(unsigned char* out, int *outlen,
  1853. const unsigned char* in, int *inlen, int quoteChar) {
  1854. const unsigned char* processed = in;
  1855. const unsigned char* outend;
  1856. const unsigned char* outstart = out;
  1857. const unsigned char* instart = in;
  1858. const unsigned char* inend;
  1859. unsigned int c, d;
  1860. int trailing;
  1861. if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
  1862. return(-1);
  1863. outend = out + (*outlen);
  1864. inend = in + (*inlen);
  1865. while (in < inend) {
  1866. d = *in++;
  1867. if (d < 0x80) { c= d; trailing= 0; }
  1868. else if (d < 0xC0) {
  1869. /* trailing byte in leading position */
  1870. *outlen = out - outstart;
  1871. *inlen = processed - instart;
  1872. return(-2);
  1873. } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
  1874. else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
  1875. else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
  1876. else {
  1877. /* no chance for this in Ascii */
  1878. *outlen = out - outstart;
  1879. *inlen = processed - instart;
  1880. return(-2);
  1881. }
  1882. if (inend - in < trailing)
  1883. break;
  1884. while (trailing--) {
  1885. if (((d= *in++) & 0xC0) != 0x80) {
  1886. *outlen = out - outstart;
  1887. *inlen = processed - instart;
  1888. return(-2);
  1889. }
  1890. c <<= 6;
  1891. c |= d & 0x3F;
  1892. }
  1893. /* assertion: c is a single UTF-4 value */
  1894. if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
  1895. (c != '&') && (c != '<') && (c != '>')) {
  1896. if (out >= outend)
  1897. break;
  1898. *out++ = c;
  1899. } else {
  1900. const htmlEntityDesc * ent;
  1901. const char *cp;
  1902. char nbuf[16];
  1903. int len;
  1904. /*
  1905. * Try to lookup a predefined HTML entity for it
  1906. */
  1907. ent = htmlEntityValueLookup(c);
  1908. if (ent == NULL) {
  1909. snprintf(nbuf, sizeof(nbuf), "#%u", c);
  1910. cp = nbuf;
  1911. }
  1912. else
  1913. cp = ent->name;
  1914. len = strlen(cp);
  1915. if (out + 2 + len > outend)
  1916. break;
  1917. *out++ = '&';
  1918. memcpy(out, cp, len);
  1919. out += len;
  1920. *out++ = ';';
  1921. }
  1922. processed = in;
  1923. }
  1924. *outlen = out - outstart;
  1925. *inlen = processed - instart;
  1926. return(0);
  1927. }
  1928. /************************************************************************
  1929. * *
  1930. * Commodity functions to handle streams *
  1931. * *
  1932. ************************************************************************/
  1933. /**
  1934. * htmlNewInputStream:
  1935. * @ctxt: an HTML parser context
  1936. *
  1937. * Create a new input stream structure
  1938. * Returns the new input stream or NULL
  1939. */
  1940. static htmlParserInputPtr
  1941. htmlNewInputStream(htmlParserCtxtPtr ctxt) {
  1942. htmlParserInputPtr input;
  1943. input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
  1944. if (input == NULL) {
  1945. htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
  1946. return(NULL);
  1947. }
  1948. memset(input, 0, sizeof(htmlParserInput));
  1949. input->filename = NULL;
  1950. input->directory = NULL;
  1951. input->base = NULL;
  1952. input->cur = NULL;
  1953. input->buf = NULL;
  1954. input->line = 1;
  1955. input->col = 1;
  1956. input->buf = NULL;
  1957. input->free = NULL;
  1958. input->version = NULL;
  1959. input->consumed = 0;
  1960. input->length = 0;
  1961. return(input);
  1962. }
  1963. /************************************************************************
  1964. * *
  1965. * Commodity functions, cleanup needed ? *
  1966. * *
  1967. ************************************************************************/
  1968. /*
  1969. * all tags allowing pc data from the html 4.01 loose dtd
  1970. * NOTE: it might be more apropriate to integrate this information
  1971. * into the html40ElementTable array but I don't want to risk any
  1972. * binary incomptibility
  1973. */
  1974. static const char *allowPCData[] = {
  1975. "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
  1976. "blockquote", "body", "button", "caption", "center", "cite", "code",
  1977. "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
  1978. "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
  1979. "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
  1980. "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
  1981. };
  1982. /**
  1983. * areBlanks:
  1984. * @ctxt: an HTML parser context
  1985. * @str: a xmlChar *
  1986. * @len: the size of @str
  1987. *
  1988. * Is this a sequence of blank chars that one can ignore ?
  1989. *
  1990. * Returns 1 if ignorable 0 otherwise.
  1991. */
  1992. static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
  1993. unsigned int i;
  1994. int j;
  1995. xmlNodePtr lastChild;
  1996. xmlDtdPtr dtd;
  1997. for (j = 0;j < len;j++)
  1998. if (!(IS_BLANK_CH(str[j]))) return(0);
  1999. if (CUR == 0) return(1);
  2000. if (CUR != '<') return(0);
  2001. if (ctxt->name == NULL)
  2002. return(1);
  2003. if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
  2004. return(1);
  2005. if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
  2006. return(1);
  2007. /* Only strip CDATA children of the body tag for strict HTML DTDs */
  2008. if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
  2009. dtd = xmlGetIntSubset(ctxt->myDoc);
  2010. if (dtd != NULL && dtd->ExternalID != NULL) {
  2011. if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
  2012. !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
  2013. return(1);
  2014. }
  2015. }
  2016. if (ctxt->node == NULL) return(0);
  2017. lastChild = xmlGetLastChild(ctxt->node);
  2018. while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
  2019. lastChild = lastChild->prev;
  2020. if (lastChild == NULL) {
  2021. if ((ctxt->node->type != XML_ELEMENT_NODE) &&
  2022. (ctxt->node->content != NULL)) return(0);
  2023. /* keep ws in constructs like ...<b> </b>...
  2024. for all tags "b" allowing PCDATA */
  2025. for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
  2026. if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
  2027. return(0);
  2028. }
  2029. }
  2030. } else if (xmlNodeIsText(lastChild)) {
  2031. return(0);
  2032. } else {
  2033. /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
  2034. for all tags "p" allowing PCDATA */
  2035. for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
  2036. if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
  2037. return(0);
  2038. }
  2039. }
  2040. }
  2041. return(1);
  2042. }
  2043. /**
  2044. * htmlNewDocNoDtD:
  2045. * @URI: URI for the dtd, or NULL
  2046. * @ExternalID: the external ID of the DTD, or NULL
  2047. *
  2048. * Creates a new HTML document without a DTD node if @URI and @ExternalID
  2049. * are NULL
  2050. *
  2051. * Returns a new document, do not initialize the DTD if not provided
  2052. */
  2053. htmlDocPtr
  2054. htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
  2055. xmlDocPtr cur;
  2056. /*
  2057. * Allocate a new document and fill the fields.
  2058. */
  2059. cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
  2060. if (cur == NULL) {
  2061. htmlErrMemory(NULL, "HTML document creation failed\n");
  2062. return(NULL);
  2063. }
  2064. memset(cur, 0, sizeof(xmlDoc));
  2065. cur->type = XML_HTML_DOCUMENT_NODE;
  2066. cur->version = NULL;
  2067. cur->intSubset = NULL;
  2068. cur->doc = cur;
  2069. cur->name = NULL;
  2070. cur->children = NULL;
  2071. cur->extSubset = NULL;
  2072. cur->oldNs = NULL;
  2073. cur->encoding = NULL;
  2074. cur->standalone = 1;
  2075. cur->compression = 0;
  2076. cur->ids = NULL;
  2077. cur->refs = NULL;
  2078. cur->_private = NULL;
  2079. cur->charset = XML_CHAR_ENCODING_UTF8;
  2080. cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
  2081. if ((ExternalID != NULL) ||
  2082. (URI != NULL))
  2083. xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
  2084. return(cur);
  2085. }
  2086. /**
  2087. * htmlNewDoc:
  2088. * @URI: URI for the dtd, or NULL
  2089. * @ExternalID: the external ID of the DTD, or NULL
  2090. *
  2091. * Creates a new HTML document
  2092. *
  2093. * Returns a new document
  2094. */
  2095. htmlDocPtr
  2096. htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
  2097. if ((URI == NULL) && (ExternalID == NULL))
  2098. return(htmlNewDocNoDtD(
  2099. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
  2100. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
  2101. return(htmlNewDocNoDtD(URI, ExternalID));
  2102. }
  2103. /************************************************************************
  2104. * *
  2105. * The parser itself *
  2106. * Relates to http://www.w3.org/TR/html40 *
  2107. * *
  2108. ************************************************************************/
  2109. /************************************************************************
  2110. * *
  2111. * The parser itself *
  2112. * *
  2113. ************************************************************************/
  2114. static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
  2115. /**
  2116. * htmlParseHTMLName:
  2117. * @ctxt: an HTML parser context
  2118. *
  2119. * parse an HTML tag or attribute name, note that we convert it to lowercase
  2120. * since HTML names are not case-sensitive.
  2121. *
  2122. * Returns the Tag Name parsed or NULL
  2123. */
  2124. static const xmlChar *
  2125. htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
  2126. int i = 0;
  2127. xmlChar loc[HTML_PARSER_BUFFER_SIZE];
  2128. if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
  2129. (CUR != ':') && (CUR != '.')) return(NULL);
  2130. while ((i < HTML_PARSER_BUFFER_SIZE) &&
  2131. ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
  2132. (CUR == ':') || (CUR == '-') || (CUR == '_') ||
  2133. (CUR == '.'))) {
  2134. if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
  2135. else loc[i] = CUR;
  2136. i++;
  2137. NEXT;
  2138. }
  2139. return(xmlDictLookup(ctxt->dict, loc, i));
  2140. }
  2141. /**
  2142. * htmlParseHTMLName_nonInvasive:
  2143. * @ctxt: an HTML parser context
  2144. *
  2145. * parse an HTML tag or attribute name, note that we convert it to lowercase
  2146. * since HTML names are not case-sensitive, this doesn't consume the data
  2147. * from the stream, it's a look-ahead
  2148. *
  2149. * Returns the Tag Name parsed or NULL
  2150. */
  2151. static const xmlChar *
  2152. htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
  2153. int i = 0;
  2154. xmlChar loc[HTML_PARSER_BUFFER_SIZE];
  2155. if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
  2156. (NXT(1) != ':')) return(NULL);
  2157. while ((i < HTML_PARSER_BUFFER_SIZE) &&
  2158. ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
  2159. (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
  2160. if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
  2161. else loc[i] = NXT(1+i);
  2162. i++;
  2163. }
  2164. return(xmlDictLookup(ctxt->dict, loc, i));
  2165. }
  2166. /**
  2167. * htmlParseName:
  2168. * @ctxt: an HTML parser context
  2169. *
  2170. * parse an HTML name, this routine is case sensitive.
  2171. *
  2172. * Returns the Name parsed or NULL
  2173. */
  2174. static const xmlChar *
  2175. htmlParseName(htmlParserCtxtPtr ctxt) {
  2176. const xmlChar *in;
  2177. const xmlChar *ret;
  2178. int count = 0;
  2179. GROW;
  2180. /*
  2181. * Accelerator for simple ASCII names
  2182. */
  2183. in = ctxt->input->cur;
  2184. if (((*in >= 0x61) && (*in <= 0x7A)) ||
  2185. ((*in >= 0x41) && (*in <= 0x5A)) ||
  2186. (*in == '_') || (*in == ':')) {
  2187. in++;
  2188. while (((*in >= 0x61) && (*in <= 0x7A)) ||
  2189. ((*in >= 0x41) && (*in <= 0x5A)) ||
  2190. ((*in >= 0x30) && (*in <= 0x39)) ||
  2191. (*in == '_') || (*in == '-') ||
  2192. (*in == ':') || (*in == '.'))
  2193. in++;
  2194. if ((*in > 0) && (*in < 0x80)) {
  2195. count = in - ctxt->input->cur;
  2196. ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
  2197. ctxt->input->cur = in;
  2198. ctxt->nbChars += count;
  2199. ctxt->input->col += count;
  2200. return(ret);
  2201. }
  2202. }
  2203. return(htmlParseNameComplex(ctxt));
  2204. }
  2205. static const xmlChar *
  2206. htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
  2207. int len = 0, l;
  2208. int c;
  2209. int count = 0;
  2210. /*
  2211. * Handler for more complex cases
  2212. */
  2213. GROW;
  2214. c = CUR_CHAR(l);
  2215. if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
  2216. (!IS_LETTER(c) && (c != '_') &&
  2217. (c != ':'))) {
  2218. return(NULL);
  2219. }
  2220. while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
  2221. ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
  2222. (c == '.') || (c == '-') ||
  2223. (c == '_') || (c == ':') ||
  2224. (IS_COMBINING(c)) ||
  2225. (IS_EXTENDER(c)))) {
  2226. if (count++ > 100) {
  2227. count = 0;
  2228. GROW;
  2229. }
  2230. len += l;
  2231. NEXTL(l);
  2232. c = CUR_CHAR(l);
  2233. }
  2234. return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
  2235. }
  2236. /**
  2237. * htmlParseHTMLAttribute:
  2238. * @ctxt: an HTML parser context
  2239. * @stop: a char stop value
  2240. *
  2241. * parse an HTML attribute value till the stop (quote), if
  2242. * stop is 0 then it stops at the first space
  2243. *
  2244. * Returns the attribute parsed or NULL
  2245. */
  2246. static xmlChar *
  2247. htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
  2248. xmlChar *buffer = NULL;
  2249. int buffer_size = 0;
  2250. xmlChar *out = NULL;
  2251. const xmlChar *name = NULL;
  2252. const xmlChar *cur = NULL;
  2253. const htmlEntityDesc * ent;
  2254. /*
  2255. * allocate a translation buffer.
  2256. */
  2257. buffer_size = HTML_PARSER_BUFFER_SIZE;
  2258. buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
  2259. if (buffer == NULL) {
  2260. htmlErrMemory(ctxt, "buffer allocation failed\n");
  2261. return(NULL);
  2262. }
  2263. out = buffer;
  2264. /*
  2265. * Ok loop until we reach one of the ending chars
  2266. */
  2267. while ((CUR != 0) && (CUR != stop)) {
  2268. if ((stop == 0) && (CUR == '>')) break;
  2269. if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
  2270. if (CUR == '&') {
  2271. if (NXT(1) == '#') {
  2272. unsigned int c;
  2273. int bits;
  2274. c = htmlParseCharRef(ctxt);
  2275. if (c < 0x80)
  2276. { *out++ = c; bits= -6; }
  2277. else if (c < 0x800)
  2278. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2279. else if (c < 0x10000)
  2280. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2281. else
  2282. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2283. for ( ; bits >= 0; bits-= 6) {
  2284. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2285. }
  2286. if (out - buffer > buffer_size - 100) {
  2287. int indx = out - buffer;
  2288. growBuffer(buffer);
  2289. out = &buffer[indx];
  2290. }
  2291. } else {
  2292. ent = htmlParseEntityRef(ctxt, &name);
  2293. if (name == NULL) {
  2294. *out++ = '&';
  2295. if (out - buffer > buffer_size - 100) {
  2296. int indx = out - buffer;
  2297. growBuffer(buffer);
  2298. out = &buffer[indx];
  2299. }
  2300. } else if (ent == NULL) {
  2301. *out++ = '&';
  2302. cur = name;
  2303. while (*cur != 0) {
  2304. if (out - buffer > buffer_size - 100) {
  2305. int indx = out - buffer;
  2306. growBuffer(buffer);
  2307. out = &buffer[indx];
  2308. }
  2309. *out++ = *cur++;
  2310. }
  2311. } else {
  2312. unsigned int c;
  2313. int bits;
  2314. if (out - buffer > buffer_size - 100) {
  2315. int indx = out - buffer;
  2316. growBuffer(buffer);
  2317. out = &buffer[indx];
  2318. }
  2319. c = ent->value;
  2320. if (c < 0x80)
  2321. { *out++ = c; bits= -6; }
  2322. else if (c < 0x800)
  2323. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2324. else if (c < 0x10000)
  2325. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2326. else
  2327. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2328. for ( ; bits >= 0; bits-= 6) {
  2329. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2330. }
  2331. }
  2332. }
  2333. } else {
  2334. unsigned int c;
  2335. int bits, l;
  2336. if (out - buffer > buffer_size - 100) {
  2337. int indx = out - buffer;
  2338. growBuffer(buffer);
  2339. out = &buffer[indx];
  2340. }
  2341. c = CUR_CHAR(l);
  2342. if (c < 0x80)
  2343. { *out++ = c; bits= -6; }
  2344. else if (c < 0x800)
  2345. { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  2346. else if (c < 0x10000)
  2347. { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  2348. else
  2349. { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
  2350. for ( ; bits >= 0; bits-= 6) {
  2351. *out++ = ((c >> bits) & 0x3F) | 0x80;
  2352. }
  2353. NEXT;
  2354. }
  2355. }
  2356. *out = 0;
  2357. return(buffer);
  2358. }
  2359. /**
  2360. * htmlParseEntityRef:
  2361. * @ctxt: an HTML parser context
  2362. * @str: location to store the entity name
  2363. *
  2364. * parse an HTML ENTITY references
  2365. *
  2366. * [68] EntityRef ::= '&' Name ';'
  2367. *
  2368. * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
  2369. * if non-NULL *str will have to be freed by the caller.
  2370. */
  2371. const htmlEntityDesc *
  2372. htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
  2373. const xmlChar *name;
  2374. const htmlEntityDesc * ent = NULL;
  2375. if (str != NULL) *str = NULL;
  2376. if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
  2377. if (CUR == '&') {
  2378. NEXT;
  2379. name = htmlParseName(ctxt);
  2380. if (name == NULL) {
  2381. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  2382. "htmlParseEntityRef: no name\n", NULL, NULL);
  2383. } else {
  2384. GROW;
  2385. if (CUR == ';') {
  2386. if (str != NULL)
  2387. *str = name;
  2388. /*
  2389. * Lookup the entity in the table.
  2390. */
  2391. ent = htmlEntityLookup(name);
  2392. if (ent != NULL) /* OK that's ugly !!! */
  2393. NEXT;
  2394. } else {
  2395. htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
  2396. "htmlParseEntityRef: expecting ';'\n",
  2397. NULL, NULL);
  2398. if (str != NULL)
  2399. *str = name;
  2400. }
  2401. }
  2402. }
  2403. return(ent);
  2404. }
  2405. /**
  2406. * htmlParseAttValue:
  2407. * @ctxt: an HTML parser context
  2408. *
  2409. * parse a value for an attribute
  2410. * Note: the parser won't do substitution of entities here, this
  2411. * will be handled later in xmlStringGetNodeList, unless it was
  2412. * asked for ctxt->replaceEntities != 0
  2413. *
  2414. * Returns the AttValue parsed or NULL.
  2415. */
  2416. static xmlChar *
  2417. htmlParseAttValue(htmlParserCtxtPtr ctxt) {
  2418. xmlChar *ret = NULL;
  2419. if (CUR == '"') {
  2420. NEXT;
  2421. ret = htmlParseHTMLAttribute(ctxt, '"');
  2422. if (CUR != '"') {
  2423. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
  2424. "AttValue: \" expected\n", NULL, NULL);
  2425. } else
  2426. NEXT;
  2427. } else if (CUR == '\'') {
  2428. NEXT;
  2429. ret = htmlParseHTMLAttribute(ctxt, '\'');
  2430. if (CUR != '\'') {
  2431. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
  2432. "AttValue: ' expected\n", NULL, NULL);
  2433. } else
  2434. NEXT;
  2435. } else {
  2436. /*
  2437. * That's an HTMLism, the attribute value may not be quoted
  2438. */
  2439. ret = htmlParseHTMLAttribute(ctxt, 0);
  2440. if (ret == NULL) {
  2441. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
  2442. "AttValue: no value found\n", NULL, NULL);
  2443. }
  2444. }
  2445. return(ret);
  2446. }
  2447. /**
  2448. * htmlParseSystemLiteral:
  2449. * @ctxt: an HTML parser context
  2450. *
  2451. * parse an HTML Literal
  2452. *
  2453. * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
  2454. *
  2455. * Returns the SystemLiteral parsed or NULL
  2456. */
  2457. static xmlChar *
  2458. htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
  2459. const xmlChar *q;
  2460. xmlChar *ret = NULL;
  2461. if (CUR == '"') {
  2462. NEXT;
  2463. q = CUR_PTR;
  2464. while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
  2465. NEXT;
  2466. if (!IS_CHAR_CH(CUR)) {
  2467. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2468. "Unfinished SystemLiteral\n", NULL, NULL);
  2469. } else {
  2470. ret = xmlStrndup(q, CUR_PTR - q);
  2471. NEXT;
  2472. }
  2473. } else if (CUR == '\'') {
  2474. NEXT;
  2475. q = CUR_PTR;
  2476. while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
  2477. NEXT;
  2478. if (!IS_CHAR_CH(CUR)) {
  2479. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2480. "Unfinished SystemLiteral\n", NULL, NULL);
  2481. } else {
  2482. ret = xmlStrndup(q, CUR_PTR - q);
  2483. NEXT;
  2484. }
  2485. } else {
  2486. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
  2487. " or ' expected\n", NULL, NULL);
  2488. }
  2489. return(ret);
  2490. }
  2491. /**
  2492. * htmlParsePubidLiteral:
  2493. * @ctxt: an HTML parser context
  2494. *
  2495. * parse an HTML public literal
  2496. *
  2497. * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
  2498. *
  2499. * Returns the PubidLiteral parsed or NULL.
  2500. */
  2501. static xmlChar *
  2502. htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
  2503. const xmlChar *q;
  2504. xmlChar *ret = NULL;
  2505. /*
  2506. * Name ::= (Letter | '_') (NameChar)*
  2507. */
  2508. if (CUR == '"') {
  2509. NEXT;
  2510. q = CUR_PTR;
  2511. while (IS_PUBIDCHAR_CH(CUR)) NEXT;
  2512. if (CUR != '"') {
  2513. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2514. "Unfinished PubidLiteral\n", NULL, NULL);
  2515. } else {
  2516. ret = xmlStrndup(q, CUR_PTR - q);
  2517. NEXT;
  2518. }
  2519. } else if (CUR == '\'') {
  2520. NEXT;
  2521. q = CUR_PTR;
  2522. while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
  2523. NEXT;
  2524. if (CUR != '\'') {
  2525. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
  2526. "Unfinished PubidLiteral\n", NULL, NULL);
  2527. } else {
  2528. ret = xmlStrndup(q, CUR_PTR - q);
  2529. NEXT;
  2530. }
  2531. } else {
  2532. htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
  2533. "PubidLiteral \" or ' expected\n", NULL, NULL);
  2534. }
  2535. return(ret);
  2536. }
  2537. /**
  2538. * htmlParseScript:
  2539. * @ctxt: an HTML parser context
  2540. *
  2541. * parse the content of an HTML SCRIPT or STYLE element
  2542. * http://www.w3.org/TR/html4/sgml/dtd.html#Script
  2543. * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
  2544. * http://www.w3.org/TR/html4/types.html#type-script
  2545. * http://www.w3.org/TR/html4/types.html#h-6.15
  2546. * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
  2547. *
  2548. * Script data ( %Script; in the DTD) can be the content of the SCRIPT
  2549. * element and the value of intrinsic event attributes. User agents must
  2550. * not evaluate script data as HTML markup but instead must pass it on as
  2551. * data to a script engine.
  2552. * NOTES:
  2553. * - The content is passed like CDATA
  2554. * - the attributes for style and scripting "onXXX" are also described
  2555. * as CDATA but SGML allows entities references in attributes so their
  2556. * processing is identical as other attributes
  2557. */
  2558. static void
  2559. htmlParseScript(htmlParserCtxtPtr ctxt) {
  2560. xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
  2561. int nbchar = 0;
  2562. int cur,l;
  2563. SHRINK;
  2564. cur = CUR_CHAR(l);
  2565. while (IS_CHAR_CH(cur)) {
  2566. if ((cur == '<') && (NXT(1) == '/')) {
  2567. /*
  2568. * One should break here, the specification is clear:
  2569. * Authors should therefore escape "</" within the content.
  2570. * Escape mechanisms are specific to each scripting or
  2571. * style sheet language.
  2572. *
  2573. * In recovery mode, only break if end tag match the
  2574. * current tag, effectively ignoring all tags inside the
  2575. * script/style block and treating the entire block as
  2576. * CDATA.
  2577. */
  2578. if (ctxt->recovery) {
  2579. if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
  2580. xmlStrlen(ctxt->name)) == 0)
  2581. {
  2582. break; /* while */
  2583. } else {
  2584. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  2585. "Element %s embeds close tag\n",
  2586. ctxt->name, NULL);
  2587. }
  2588. } else {
  2589. if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
  2590. ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
  2591. {
  2592. break; /* while */
  2593. }
  2594. }
  2595. }
  2596. COPY_BUF(l,buf,nbchar,cur);
  2597. if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
  2598. if (ctxt->sax->cdataBlock!= NULL) {
  2599. /*
  2600. * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
  2601. */
  2602. ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
  2603. } else if (ctxt->sax->characters != NULL) {
  2604. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2605. }
  2606. nbchar = 0;
  2607. }
  2608. GROW;
  2609. NEXTL(l);
  2610. cur = CUR_CHAR(l);
  2611. }
  2612. if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
  2613. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2614. "Invalid char in CDATA 0x%X\n", cur);
  2615. NEXT;
  2616. }
  2617. if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2618. if (ctxt->sax->cdataBlock!= NULL) {
  2619. /*
  2620. * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
  2621. */
  2622. ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
  2623. } else if (ctxt->sax->characters != NULL) {
  2624. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2625. }
  2626. }
  2627. }
  2628. /**
  2629. * htmlParseCharData:
  2630. * @ctxt: an HTML parser context
  2631. *
  2632. * parse a CharData section.
  2633. * if we are within a CDATA section ']]>' marks an end of section.
  2634. *
  2635. * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  2636. */
  2637. static void
  2638. htmlParseCharData(htmlParserCtxtPtr ctxt) {
  2639. xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
  2640. int nbchar = 0;
  2641. int cur, l;
  2642. int chunk = 0;
  2643. SHRINK;
  2644. cur = CUR_CHAR(l);
  2645. while (((cur != '<') || (ctxt->token == '<')) &&
  2646. ((cur != '&') || (ctxt->token == '&')) &&
  2647. (cur != 0)) {
  2648. if (!(IS_CHAR(cur))) {
  2649. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  2650. "Invalid char in CDATA 0x%X\n", cur);
  2651. } else {
  2652. COPY_BUF(l,buf,nbchar,cur);
  2653. }
  2654. if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
  2655. /*
  2656. * Ok the segment is to be consumed as chars.
  2657. */
  2658. if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2659. if (areBlanks(ctxt, buf, nbchar)) {
  2660. if (ctxt->sax->ignorableWhitespace != NULL)
  2661. ctxt->sax->ignorableWhitespace(ctxt->userData,
  2662. buf, nbchar);
  2663. } else {
  2664. htmlCheckParagraph(ctxt);
  2665. if (ctxt->sax->characters != NULL)
  2666. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2667. }
  2668. }
  2669. nbchar = 0;
  2670. }
  2671. NEXTL(l);
  2672. chunk++;
  2673. if (chunk > HTML_PARSER_BUFFER_SIZE) {
  2674. chunk = 0;
  2675. SHRINK;
  2676. GROW;
  2677. }
  2678. cur = CUR_CHAR(l);
  2679. if (cur == 0) {
  2680. SHRINK;
  2681. GROW;
  2682. cur = CUR_CHAR(l);
  2683. }
  2684. }
  2685. if (nbchar != 0) {
  2686. buf[nbchar] = 0;
  2687. /*
  2688. * Ok the segment is to be consumed as chars.
  2689. */
  2690. if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
  2691. if (areBlanks(ctxt, buf, nbchar)) {
  2692. if (ctxt->sax->ignorableWhitespace != NULL)
  2693. ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
  2694. } else {
  2695. htmlCheckParagraph(ctxt);
  2696. if (ctxt->sax->characters != NULL)
  2697. ctxt->sax->characters(ctxt->userData, buf, nbchar);
  2698. }
  2699. }
  2700. } else {
  2701. /*
  2702. * Loop detection
  2703. */
  2704. if (cur == 0)
  2705. ctxt->instate = XML_PARSER_EOF;
  2706. }
  2707. }
  2708. /**
  2709. * htmlParseExternalID:
  2710. * @ctxt: an HTML parser context
  2711. * @publicID: a xmlChar** receiving PubidLiteral
  2712. *
  2713. * Parse an External ID or a Public ID
  2714. *
  2715. * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
  2716. * | 'PUBLIC' S PubidLiteral S SystemLiteral
  2717. *
  2718. * [83] PublicID ::= 'PUBLIC' S PubidLiteral
  2719. *
  2720. * Returns the function returns SystemLiteral and in the second
  2721. * case publicID receives PubidLiteral, is strict is off
  2722. * it is possible to return NULL and have publicID set.
  2723. */
  2724. static xmlChar *
  2725. htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
  2726. xmlChar *URI = NULL;
  2727. if ((UPPER == 'S') && (UPP(1) == 'Y') &&
  2728. (UPP(2) == 'S') && (UPP(3) == 'T') &&
  2729. (UPP(4) == 'E') && (UPP(5) == 'M')) {
  2730. SKIP(6);
  2731. if (!IS_BLANK_CH(CUR)) {
  2732. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  2733. "Space required after 'SYSTEM'\n", NULL, NULL);
  2734. }
  2735. SKIP_BLANKS;
  2736. URI = htmlParseSystemLiteral(ctxt);
  2737. if (URI == NULL) {
  2738. htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
  2739. "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
  2740. }
  2741. } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
  2742. (UPP(2) == 'B') && (UPP(3) == 'L') &&
  2743. (UPP(4) == 'I') && (UPP(5) == 'C')) {
  2744. SKIP(6);
  2745. if (!IS_BLANK_CH(CUR)) {
  2746. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  2747. "Space required after 'PUBLIC'\n", NULL, NULL);
  2748. }
  2749. SKIP_BLANKS;
  2750. *publicID = htmlParsePubidLiteral(ctxt);
  2751. if (*publicID == NULL) {
  2752. htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
  2753. "htmlParseExternalID: PUBLIC, no Public Identifier\n",
  2754. NULL, NULL);
  2755. }
  2756. SKIP_BLANKS;
  2757. if ((CUR == '"') || (CUR == '\'')) {
  2758. URI = htmlParseSystemLiteral(ctxt);
  2759. }
  2760. }
  2761. return(URI);
  2762. }
  2763. /**
  2764. * xmlParsePI:
  2765. * @ctxt: an XML parser context
  2766. *
  2767. * parse an XML Processing Instruction.
  2768. *
  2769. * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  2770. */
  2771. static void
  2772. htmlParsePI(htmlParserCtxtPtr ctxt) {
  2773. xmlChar *buf = NULL;
  2774. int len = 0;
  2775. int size = HTML_PARSER_BUFFER_SIZE;
  2776. int cur, l;
  2777. const xmlChar *target;
  2778. xmlParserInputState state;
  2779. int count = 0;
  2780. if ((RAW == '<') && (NXT(1) == '?')) {
  2781. state = ctxt->instate;
  2782. ctxt->instate = XML_PARSER_PI;
  2783. /*
  2784. * this is a Processing Instruction.
  2785. */
  2786. SKIP(2);
  2787. SHRINK;
  2788. /*
  2789. * Parse the target name and check for special support like
  2790. * namespace.
  2791. */
  2792. target = htmlParseName(ctxt);
  2793. if (target != NULL) {
  2794. if (RAW == '>') {
  2795. SKIP(1);
  2796. /*
  2797. * SAX: PI detected.
  2798. */
  2799. if ((ctxt->sax) && (!ctxt->disableSAX) &&
  2800. (ctxt->sax->processingInstruction != NULL))
  2801. ctxt->sax->processingInstruction(ctxt->userData,
  2802. target, NULL);
  2803. ctxt->instate = state;
  2804. return;
  2805. }
  2806. buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
  2807. if (buf == NULL) {
  2808. htmlErrMemory(ctxt, NULL);
  2809. ctxt->instate = state;
  2810. return;
  2811. }
  2812. cur = CUR;
  2813. if (!IS_BLANK(cur)) {
  2814. htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
  2815. "ParsePI: PI %s space expected\n", target, NULL);
  2816. }
  2817. SKIP_BLANKS;
  2818. cur = CUR_CHAR(l);
  2819. while (IS_CHAR(cur) && (cur != '>')) {
  2820. if (len + 5 >= size) {
  2821. xmlChar *tmp;
  2822. size *= 2;
  2823. tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
  2824. if (tmp == NULL) {
  2825. htmlErrMemory(ctxt, NULL);
  2826. xmlFree(buf);
  2827. ctxt->instate = state;
  2828. return;
  2829. }
  2830. buf = tmp;
  2831. }
  2832. count++;
  2833. if (count > 50) {
  2834. GROW;
  2835. count = 0;
  2836. }
  2837. COPY_BUF(l,buf,len,cur);
  2838. NEXTL(l);
  2839. cur = CUR_CHAR(l);
  2840. if (cur == 0) {
  2841. SHRINK;
  2842. GROW;
  2843. cur = CUR_CHAR(l);
  2844. }
  2845. }
  2846. buf[len] = 0;
  2847. if (cur != '>') {
  2848. htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
  2849. "ParsePI: PI %s never end ...\n", target, NULL);
  2850. } else {
  2851. SKIP(1);
  2852. /*
  2853. * SAX: PI detected.
  2854. */
  2855. if ((ctxt->sax) && (!ctxt->disableSAX) &&
  2856. (ctxt->sax->processingInstruction != NULL))
  2857. ctxt->sax->processingInstruction(ctxt->userData,
  2858. target, buf);
  2859. }
  2860. xmlFree(buf);
  2861. } else {
  2862. htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
  2863. "PI is not started correctly", NULL, NULL);
  2864. }
  2865. ctxt->instate = state;
  2866. }
  2867. }
  2868. /**
  2869. * htmlParseComment:
  2870. * @ctxt: an HTML parser context
  2871. *
  2872. * Parse an XML (SGML) comment <!-- .... -->
  2873. *
  2874. * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
  2875. */
  2876. static void
  2877. htmlParseComment(htmlParserCtxtPtr ctxt) {
  2878. xmlChar *buf = NULL;
  2879. int len;
  2880. int size = HTML_PARSER_BUFFER_SIZE;
  2881. int q, ql;
  2882. int r, rl;
  2883. int cur, l;
  2884. xmlParserInputState state;
  2885. /*
  2886. * Check that there is a comment right here.
  2887. */
  2888. if ((RAW != '<') || (NXT(1) != '!') ||
  2889. (NXT(2) != '-') || (NXT(3) != '-')) return;
  2890. state = ctxt->instate;
  2891. ctxt->instate = XML_PARSER_COMMENT;
  2892. SHRINK;
  2893. SKIP(4);
  2894. buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
  2895. if (buf == NULL) {
  2896. htmlErrMemory(ctxt, "buffer allocation failed\n");
  2897. ctxt->instate = state;
  2898. return;
  2899. }
  2900. q = CUR_CHAR(ql);
  2901. NEXTL(ql);
  2902. r = CUR_CHAR(rl);
  2903. NEXTL(rl);
  2904. cur = CUR_CHAR(l);
  2905. len = 0;
  2906. while (IS_CHAR(cur) &&
  2907. ((cur != '>') ||
  2908. (r != '-') || (q != '-'))) {
  2909. if (len + 5 >= size) {
  2910. xmlChar *tmp;
  2911. size *= 2;
  2912. tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
  2913. if (tmp == NULL) {
  2914. xmlFree(buf);
  2915. htmlErrMemory(ctxt, "growing buffer failed\n");
  2916. ctxt->instate = state;
  2917. return;
  2918. }
  2919. buf = tmp;
  2920. }
  2921. COPY_BUF(ql,buf,len,q);
  2922. q = r;
  2923. ql = rl;
  2924. r = cur;
  2925. rl = l;
  2926. NEXTL(l);
  2927. cur = CUR_CHAR(l);
  2928. if (cur == 0) {
  2929. SHRINK;
  2930. GROW;
  2931. cur = CUR_CHAR(l);
  2932. }
  2933. }
  2934. buf[len] = 0;
  2935. if (!IS_CHAR(cur)) {
  2936. htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
  2937. "Comment not terminated \n<!--%.50s\n", buf, NULL);
  2938. xmlFree(buf);
  2939. } else {
  2940. NEXT;
  2941. if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
  2942. (!ctxt->disableSAX))
  2943. ctxt->sax->comment(ctxt->userData, buf);
  2944. xmlFree(buf);
  2945. }
  2946. ctxt->instate = state;
  2947. }
  2948. /**
  2949. * htmlParseCharRef:
  2950. * @ctxt: an HTML parser context
  2951. *
  2952. * parse Reference declarations
  2953. *
  2954. * [66] CharRef ::= '&#' [0-9]+ ';' |
  2955. * '&#x' [0-9a-fA-F]+ ';'
  2956. *
  2957. * Returns the value parsed (as an int)
  2958. */
  2959. int
  2960. htmlParseCharRef(htmlParserCtxtPtr ctxt) {
  2961. int val = 0;
  2962. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  2963. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  2964. "htmlParseCharRef: context error\n",
  2965. NULL, NULL);
  2966. return(0);
  2967. }
  2968. if ((CUR == '&') && (NXT(1) == '#') &&
  2969. ((NXT(2) == 'x') || NXT(2) == 'X')) {
  2970. SKIP(3);
  2971. while (CUR != ';') {
  2972. if ((CUR >= '0') && (CUR <= '9'))
  2973. val = val * 16 + (CUR - '0');
  2974. else if ((CUR >= 'a') && (CUR <= 'f'))
  2975. val = val * 16 + (CUR - 'a') + 10;
  2976. else if ((CUR >= 'A') && (CUR <= 'F'))
  2977. val = val * 16 + (CUR - 'A') + 10;
  2978. else {
  2979. htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
  2980. "htmlParseCharRef: missing semicolumn\n",
  2981. NULL, NULL);
  2982. break;
  2983. }
  2984. NEXT;
  2985. }
  2986. if (CUR == ';')
  2987. NEXT;
  2988. } else if ((CUR == '&') && (NXT(1) == '#')) {
  2989. SKIP(2);
  2990. while (CUR != ';') {
  2991. if ((CUR >= '0') && (CUR <= '9'))
  2992. val = val * 10 + (CUR - '0');
  2993. else {
  2994. htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
  2995. "htmlParseCharRef: missing semicolumn\n",
  2996. NULL, NULL);
  2997. break;
  2998. }
  2999. NEXT;
  3000. }
  3001. if (CUR == ';')
  3002. NEXT;
  3003. } else {
  3004. htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
  3005. "htmlParseCharRef: invalid value\n", NULL, NULL);
  3006. }
  3007. /*
  3008. * Check the value IS_CHAR ...
  3009. */
  3010. if (IS_CHAR(val)) {
  3011. return(val);
  3012. } else {
  3013. htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
  3014. "htmlParseCharRef: invalid xmlChar value %d\n",
  3015. val);
  3016. }
  3017. return(0);
  3018. }
  3019. /**
  3020. * htmlParseDocTypeDecl:
  3021. * @ctxt: an HTML parser context
  3022. *
  3023. * parse a DOCTYPE declaration
  3024. *
  3025. * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
  3026. * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
  3027. */
  3028. static void
  3029. htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
  3030. const xmlChar *name;
  3031. xmlChar *ExternalID = NULL;
  3032. xmlChar *URI = NULL;
  3033. /*
  3034. * We know that '<!DOCTYPE' has been detected.
  3035. */
  3036. SKIP(9);
  3037. SKIP_BLANKS;
  3038. /*
  3039. * Parse the DOCTYPE name.
  3040. */
  3041. name = htmlParseName(ctxt);
  3042. if (name == NULL) {
  3043. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3044. "htmlParseDocTypeDecl : no DOCTYPE name !\n",
  3045. NULL, NULL);
  3046. }
  3047. /*
  3048. * Check that upper(name) == "HTML" !!!!!!!!!!!!!
  3049. */
  3050. SKIP_BLANKS;
  3051. /*
  3052. * Check for SystemID and ExternalID
  3053. */
  3054. URI = htmlParseExternalID(ctxt, &ExternalID);
  3055. SKIP_BLANKS;
  3056. /*
  3057. * We should be at the end of the DOCTYPE declaration.
  3058. */
  3059. if (CUR != '>') {
  3060. htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
  3061. "DOCTYPE improperly terminated\n", NULL, NULL);
  3062. /* We shouldn't try to resynchronize ... */
  3063. }
  3064. NEXT;
  3065. /*
  3066. * Create or update the document accordingly to the DOCTYPE
  3067. */
  3068. if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
  3069. (!ctxt->disableSAX))
  3070. ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
  3071. /*
  3072. * Cleanup, since we don't use all those identifiers
  3073. */
  3074. if (URI != NULL) xmlFree(URI);
  3075. if (ExternalID != NULL) xmlFree(ExternalID);
  3076. }
  3077. /**
  3078. * htmlParseAttribute:
  3079. * @ctxt: an HTML parser context
  3080. * @value: a xmlChar ** used to store the value of the attribute
  3081. *
  3082. * parse an attribute
  3083. *
  3084. * [41] Attribute ::= Name Eq AttValue
  3085. *
  3086. * [25] Eq ::= S? '=' S?
  3087. *
  3088. * With namespace:
  3089. *
  3090. * [NS 11] Attribute ::= QName Eq AttValue
  3091. *
  3092. * Also the case QName == xmlns:??? is handled independently as a namespace
  3093. * definition.
  3094. *
  3095. * Returns the attribute name, and the value in *value.
  3096. */
  3097. static const xmlChar *
  3098. htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
  3099. const xmlChar *name;
  3100. xmlChar *val = NULL;
  3101. *value = NULL;
  3102. name = htmlParseHTMLName(ctxt);
  3103. if (name == NULL) {
  3104. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3105. "error parsing attribute name\n", NULL, NULL);
  3106. return(NULL);
  3107. }
  3108. /*
  3109. * read the value
  3110. */
  3111. SKIP_BLANKS;
  3112. if (CUR == '=') {
  3113. NEXT;
  3114. SKIP_BLANKS;
  3115. val = htmlParseAttValue(ctxt);
  3116. } else if (htmlIsBooleanAttr(name)) {
  3117. /*
  3118. * assume a minimized attribute
  3119. */
  3120. val = xmlStrdup(name);
  3121. }
  3122. *value = val;
  3123. return(name);
  3124. }
  3125. /**
  3126. * htmlCheckEncoding:
  3127. * @ctxt: an HTML parser context
  3128. * @attvalue: the attribute value
  3129. *
  3130. * Checks an http-equiv attribute from a Meta tag to detect
  3131. * the encoding
  3132. * If a new encoding is detected the parser is switched to decode
  3133. * it and pass UTF8
  3134. */
  3135. static void
  3136. htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
  3137. const xmlChar *encoding;
  3138. if ((ctxt == NULL) || (attvalue == NULL))
  3139. return;
  3140. /* do not change encoding */
  3141. if (ctxt->input->encoding != NULL)
  3142. return;
  3143. encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
  3144. if (encoding != NULL) {
  3145. encoding += 8;
  3146. } else {
  3147. encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
  3148. if (encoding != NULL)
  3149. encoding += 9;
  3150. }
  3151. if (encoding != NULL) {
  3152. xmlCharEncoding enc;
  3153. xmlCharEncodingHandlerPtr handler;
  3154. while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
  3155. if (ctxt->input->encoding != NULL)
  3156. xmlFree((xmlChar *) ctxt->input->encoding);
  3157. ctxt->input->encoding = xmlStrdup(encoding);
  3158. enc = xmlParseCharEncoding((const char *) encoding);
  3159. /*
  3160. * registered set of known encodings
  3161. */
  3162. if (enc != XML_CHAR_ENCODING_ERROR) {
  3163. if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
  3164. (enc == XML_CHAR_ENCODING_UTF16BE) ||
  3165. (enc == XML_CHAR_ENCODING_UCS4LE) ||
  3166. (enc == XML_CHAR_ENCODING_UCS4BE)) &&
  3167. (ctxt->input->buf != NULL) &&
  3168. (ctxt->input->buf->encoder == NULL)) {
  3169. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  3170. "htmlCheckEncoding: wrong encoding meta\n",
  3171. NULL, NULL);
  3172. } else {
  3173. xmlSwitchEncoding(ctxt, enc);
  3174. }
  3175. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  3176. } else {
  3177. /*
  3178. * fallback for unknown encodings
  3179. */
  3180. handler = xmlFindCharEncodingHandler((const char *) encoding);
  3181. if (handler != NULL) {
  3182. xmlSwitchToEncoding(ctxt, handler);
  3183. ctxt->charset = XML_CHAR_ENCODING_UTF8;
  3184. } else {
  3185. ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
  3186. }
  3187. }
  3188. if ((ctxt->input->buf != NULL) &&
  3189. (ctxt->input->buf->encoder != NULL) &&
  3190. (ctxt->input->buf->raw != NULL) &&
  3191. (ctxt->input->buf->buffer != NULL)) {
  3192. int nbchars;
  3193. int processed;
  3194. /*
  3195. * convert as much as possible to the parser reading buffer.
  3196. */
  3197. processed = ctxt->input->cur - ctxt->input->base;
  3198. xmlBufferShrink(ctxt->input->buf->buffer, processed);
  3199. nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
  3200. ctxt->input->buf->buffer,
  3201. ctxt->input->buf->raw);
  3202. if (nbchars < 0) {
  3203. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  3204. "htmlCheckEncoding: encoder error\n",
  3205. NULL, NULL);
  3206. }
  3207. ctxt->input->base =
  3208. ctxt->input->cur = ctxt->input->buf->buffer->content;
  3209. }
  3210. }
  3211. }
  3212. /**
  3213. * htmlCheckMeta:
  3214. * @ctxt: an HTML parser context
  3215. * @atts: the attributes values
  3216. *
  3217. * Checks an attributes from a Meta tag
  3218. */
  3219. static void
  3220. htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
  3221. int i;
  3222. const xmlChar *att, *value;
  3223. int http = 0;
  3224. const xmlChar *content = NULL;
  3225. if ((ctxt == NULL) || (atts == NULL))
  3226. return;
  3227. i = 0;
  3228. att = atts[i++];
  3229. while (att != NULL) {
  3230. value = atts[i++];
  3231. if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
  3232. && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
  3233. http = 1;
  3234. else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
  3235. content = value;
  3236. att = atts[i++];
  3237. }
  3238. if ((http) && (content != NULL))
  3239. htmlCheckEncoding(ctxt, content);
  3240. }
  3241. /**
  3242. * htmlParseStartTag:
  3243. * @ctxt: an HTML parser context
  3244. *
  3245. * parse a start of tag either for rule element or
  3246. * EmptyElement. In both case we don't parse the tag closing chars.
  3247. *
  3248. * [40] STag ::= '<' Name (S Attribute)* S? '>'
  3249. *
  3250. * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
  3251. *
  3252. * With namespace:
  3253. *
  3254. * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
  3255. *
  3256. * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
  3257. *
  3258. * Returns 0 in case of success, -1 in case of error and 1 if discarded
  3259. */
  3260. static int
  3261. htmlParseStartTag(htmlParserCtxtPtr ctxt) {
  3262. const xmlChar *name;
  3263. const xmlChar *attname;
  3264. xmlChar *attvalue;
  3265. const xmlChar **atts;
  3266. int nbatts = 0;
  3267. int maxatts;
  3268. int meta = 0;
  3269. int i;
  3270. int discardtag = 0;
  3271. if (ctxt->instate == XML_PARSER_EOF)
  3272. return(-1);
  3273. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3274. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3275. "htmlParseStartTag: context error\n", NULL, NULL);
  3276. return -1;
  3277. }
  3278. if (CUR != '<') return -1;
  3279. NEXT;
  3280. atts = ctxt->atts;
  3281. maxatts = ctxt->maxatts;
  3282. GROW;
  3283. name = htmlParseHTMLName(ctxt);
  3284. if (name == NULL) {
  3285. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3286. "htmlParseStartTag: invalid element name\n",
  3287. NULL, NULL);
  3288. /* Dump the bogus tag like browsers do */
  3289. while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
  3290. (ctxt->instate != XML_PARSER_EOF))
  3291. NEXT;
  3292. return -1;
  3293. }
  3294. if (xmlStrEqual(name, BAD_CAST"meta"))
  3295. meta = 1;
  3296. /*
  3297. * Check for auto-closure of HTML elements.
  3298. */
  3299. htmlAutoClose(ctxt, name);
  3300. /*
  3301. * Check for implied HTML elements.
  3302. */
  3303. htmlCheckImplied(ctxt, name);
  3304. /*
  3305. * Avoid html at any level > 0, head at any level != 1
  3306. * or any attempt to recurse body
  3307. */
  3308. if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
  3309. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3310. "htmlParseStartTag: misplaced <html> tag\n",
  3311. name, NULL);
  3312. discardtag = 1;
  3313. ctxt->depth++;
  3314. }
  3315. if ((ctxt->nameNr != 1) &&
  3316. (xmlStrEqual(name, BAD_CAST"head"))) {
  3317. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3318. "htmlParseStartTag: misplaced <head> tag\n",
  3319. name, NULL);
  3320. discardtag = 1;
  3321. ctxt->depth++;
  3322. }
  3323. if (xmlStrEqual(name, BAD_CAST"body")) {
  3324. int indx;
  3325. for (indx = 0;indx < ctxt->nameNr;indx++) {
  3326. if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
  3327. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3328. "htmlParseStartTag: misplaced <body> tag\n",
  3329. name, NULL);
  3330. discardtag = 1;
  3331. ctxt->depth++;
  3332. }
  3333. }
  3334. }
  3335. /*
  3336. * Now parse the attributes, it ends up with the ending
  3337. *
  3338. * (S Attribute)* S?
  3339. */
  3340. SKIP_BLANKS;
  3341. while ((IS_CHAR_CH(CUR)) &&
  3342. (CUR != '>') &&
  3343. ((CUR != '/') || (NXT(1) != '>'))) {
  3344. long cons = ctxt->nbChars;
  3345. GROW;
  3346. attname = htmlParseAttribute(ctxt, &attvalue);
  3347. if (attname != NULL) {
  3348. /*
  3349. * Well formedness requires at most one declaration of an attribute
  3350. */
  3351. for (i = 0; i < nbatts;i += 2) {
  3352. if (xmlStrEqual(atts[i], attname)) {
  3353. htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
  3354. "Attribute %s redefined\n", attname, NULL);
  3355. if (attvalue != NULL)
  3356. xmlFree(attvalue);
  3357. goto failed;
  3358. }
  3359. }
  3360. /*
  3361. * Add the pair to atts
  3362. */
  3363. if (atts == NULL) {
  3364. maxatts = 22; /* allow for 10 attrs by default */
  3365. atts = (const xmlChar **)
  3366. xmlMalloc(maxatts * sizeof(xmlChar *));
  3367. if (atts == NULL) {
  3368. htmlErrMemory(ctxt, NULL);
  3369. if (attvalue != NULL)
  3370. xmlFree(attvalue);
  3371. goto failed;
  3372. }
  3373. ctxt->atts = atts;
  3374. ctxt->maxatts = maxatts;
  3375. } else if (nbatts + 4 > maxatts) {
  3376. const xmlChar **n;
  3377. maxatts *= 2;
  3378. n = (const xmlChar **) xmlRealloc((void *) atts,
  3379. maxatts * sizeof(const xmlChar *));
  3380. if (n == NULL) {
  3381. htmlErrMemory(ctxt, NULL);
  3382. if (attvalue != NULL)
  3383. xmlFree(attvalue);
  3384. goto failed;
  3385. }
  3386. atts = n;
  3387. ctxt->atts = atts;
  3388. ctxt->maxatts = maxatts;
  3389. }
  3390. atts[nbatts++] = attname;
  3391. atts[nbatts++] = attvalue;
  3392. atts[nbatts] = NULL;
  3393. atts[nbatts + 1] = NULL;
  3394. }
  3395. else {
  3396. if (attvalue != NULL)
  3397. xmlFree(attvalue);
  3398. /* Dump the bogus attribute string up to the next blank or
  3399. * the end of the tag. */
  3400. while ((IS_CHAR_CH(CUR)) &&
  3401. !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
  3402. ((CUR != '/') || (NXT(1) != '>')))
  3403. NEXT;
  3404. }
  3405. failed:
  3406. SKIP_BLANKS;
  3407. if (cons == ctxt->nbChars) {
  3408. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3409. "htmlParseStartTag: problem parsing attributes\n",
  3410. NULL, NULL);
  3411. break;
  3412. }
  3413. }
  3414. /*
  3415. * Handle specific association to the META tag
  3416. */
  3417. if (meta && (nbatts != 0))
  3418. htmlCheckMeta(ctxt, atts);
  3419. /*
  3420. * SAX: Start of Element !
  3421. */
  3422. if (!discardtag) {
  3423. htmlnamePush(ctxt, name);
  3424. if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
  3425. if (nbatts != 0)
  3426. ctxt->sax->startElement(ctxt->userData, name, atts);
  3427. else
  3428. ctxt->sax->startElement(ctxt->userData, name, NULL);
  3429. }
  3430. }
  3431. if (atts != NULL) {
  3432. for (i = 1;i < nbatts;i += 2) {
  3433. if (atts[i] != NULL)
  3434. xmlFree((xmlChar *) atts[i]);
  3435. }
  3436. }
  3437. return(discardtag);
  3438. }
  3439. /**
  3440. * htmlParseEndTag:
  3441. * @ctxt: an HTML parser context
  3442. *
  3443. * parse an end of tag
  3444. *
  3445. * [42] ETag ::= '</' Name S? '>'
  3446. *
  3447. * With namespace
  3448. *
  3449. * [NS 9] ETag ::= '</' QName S? '>'
  3450. *
  3451. * Returns 1 if the current level should be closed.
  3452. */
  3453. static int
  3454. htmlParseEndTag(htmlParserCtxtPtr ctxt)
  3455. {
  3456. const xmlChar *name;
  3457. const xmlChar *oldname;
  3458. int i, ret;
  3459. if ((CUR != '<') || (NXT(1) != '/')) {
  3460. htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
  3461. "htmlParseEndTag: '</' not found\n", NULL, NULL);
  3462. return (0);
  3463. }
  3464. SKIP(2);
  3465. name = htmlParseHTMLName(ctxt);
  3466. if (name == NULL)
  3467. return (0);
  3468. /*
  3469. * We should definitely be at the ending "S? '>'" part
  3470. */
  3471. SKIP_BLANKS;
  3472. if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
  3473. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  3474. "End tag : expected '>'\n", NULL, NULL);
  3475. if (ctxt->recovery) {
  3476. /*
  3477. * We're not at the ending > !!
  3478. * Error, unless in recover mode where we search forwards
  3479. * until we find a >
  3480. */
  3481. while (CUR != '\0' && CUR != '>') NEXT;
  3482. NEXT;
  3483. }
  3484. } else
  3485. NEXT;
  3486. /*
  3487. * if we ignored misplaced tags in htmlParseStartTag don't pop them
  3488. * out now.
  3489. */
  3490. if ((ctxt->depth > 0) &&
  3491. (xmlStrEqual(name, BAD_CAST "html") ||
  3492. xmlStrEqual(name, BAD_CAST "body") ||
  3493. xmlStrEqual(name, BAD_CAST "head"))) {
  3494. ctxt->depth--;
  3495. return (0);
  3496. }
  3497. /*
  3498. * If the name read is not one of the element in the parsing stack
  3499. * then return, it's just an error.
  3500. */
  3501. for (i = (ctxt->nameNr - 1); i >= 0; i--) {
  3502. if (xmlStrEqual(name, ctxt->nameTab[i]))
  3503. break;
  3504. }
  3505. if (i < 0) {
  3506. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  3507. "Unexpected end tag : %s\n", name, NULL);
  3508. return (0);
  3509. }
  3510. /*
  3511. * Check for auto-closure of HTML elements.
  3512. */
  3513. htmlAutoCloseOnClose(ctxt, name);
  3514. /*
  3515. * Well formedness constraints, opening and closing must match.
  3516. * With the exception that the autoclose may have popped stuff out
  3517. * of the stack.
  3518. */
  3519. if (!xmlStrEqual(name, ctxt->name)) {
  3520. if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
  3521. htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
  3522. "Opening and ending tag mismatch: %s and %s\n",
  3523. name, ctxt->name);
  3524. }
  3525. }
  3526. /*
  3527. * SAX: End of Tag
  3528. */
  3529. oldname = ctxt->name;
  3530. if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
  3531. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  3532. ctxt->sax->endElement(ctxt->userData, name);
  3533. htmlnamePop(ctxt);
  3534. ret = 1;
  3535. } else {
  3536. ret = 0;
  3537. }
  3538. return (ret);
  3539. }
  3540. /**
  3541. * htmlParseReference:
  3542. * @ctxt: an HTML parser context
  3543. *
  3544. * parse and handle entity references in content,
  3545. * this will end-up in a call to character() since this is either a
  3546. * CharRef, or a predefined entity.
  3547. */
  3548. static void
  3549. htmlParseReference(htmlParserCtxtPtr ctxt) {
  3550. const htmlEntityDesc * ent;
  3551. xmlChar out[6];
  3552. const xmlChar *name;
  3553. if (CUR != '&') return;
  3554. if (NXT(1) == '#') {
  3555. unsigned int c;
  3556. int bits, i = 0;
  3557. c = htmlParseCharRef(ctxt);
  3558. if (c == 0)
  3559. return;
  3560. if (c < 0x80) { out[i++]= c; bits= -6; }
  3561. else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  3562. else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  3563. else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
  3564. for ( ; bits >= 0; bits-= 6) {
  3565. out[i++]= ((c >> bits) & 0x3F) | 0x80;
  3566. }
  3567. out[i] = 0;
  3568. htmlCheckParagraph(ctxt);
  3569. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3570. ctxt->sax->characters(ctxt->userData, out, i);
  3571. } else {
  3572. ent = htmlParseEntityRef(ctxt, &name);
  3573. if (name == NULL) {
  3574. htmlCheckParagraph(ctxt);
  3575. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3576. ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
  3577. return;
  3578. }
  3579. if ((ent == NULL) || !(ent->value > 0)) {
  3580. htmlCheckParagraph(ctxt);
  3581. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
  3582. ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
  3583. ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
  3584. /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
  3585. }
  3586. } else {
  3587. unsigned int c;
  3588. int bits, i = 0;
  3589. c = ent->value;
  3590. if (c < 0x80)
  3591. { out[i++]= c; bits= -6; }
  3592. else if (c < 0x800)
  3593. { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
  3594. else if (c < 0x10000)
  3595. { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
  3596. else
  3597. { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
  3598. for ( ; bits >= 0; bits-= 6) {
  3599. out[i++]= ((c >> bits) & 0x3F) | 0x80;
  3600. }
  3601. out[i] = 0;
  3602. htmlCheckParagraph(ctxt);
  3603. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  3604. ctxt->sax->characters(ctxt->userData, out, i);
  3605. }
  3606. }
  3607. }
  3608. /**
  3609. * htmlParseContent:
  3610. * @ctxt: an HTML parser context
  3611. *
  3612. * Parse a content: comment, sub-element, reference or text.
  3613. */
  3614. static void
  3615. htmlParseContent(htmlParserCtxtPtr ctxt) {
  3616. xmlChar *currentNode;
  3617. int depth;
  3618. const xmlChar *name;
  3619. currentNode = xmlStrdup(ctxt->name);
  3620. depth = ctxt->nameNr;
  3621. while (1) {
  3622. long cons = ctxt->nbChars;
  3623. GROW;
  3624. if (ctxt->instate == XML_PARSER_EOF)
  3625. break;
  3626. /*
  3627. * Our tag or one of it's parent or children is ending.
  3628. */
  3629. if ((CUR == '<') && (NXT(1) == '/')) {
  3630. if (htmlParseEndTag(ctxt) &&
  3631. ((currentNode != NULL) || (ctxt->nameNr == 0))) {
  3632. if (currentNode != NULL)
  3633. xmlFree(currentNode);
  3634. return;
  3635. }
  3636. continue; /* while */
  3637. }
  3638. else if ((CUR == '<') &&
  3639. ((IS_ASCII_LETTER(NXT(1))) ||
  3640. (NXT(1) == '_') || (NXT(1) == ':'))) {
  3641. name = htmlParseHTMLName_nonInvasive(ctxt);
  3642. if (name == NULL) {
  3643. htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
  3644. "htmlParseStartTag: invalid element name\n",
  3645. NULL, NULL);
  3646. /* Dump the bogus tag like browsers do */
  3647. while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
  3648. NEXT;
  3649. if (currentNode != NULL)
  3650. xmlFree(currentNode);
  3651. return;
  3652. }
  3653. if (ctxt->name != NULL) {
  3654. if (htmlCheckAutoClose(name, ctxt->name) == 1) {
  3655. htmlAutoClose(ctxt, name);
  3656. continue;
  3657. }
  3658. }
  3659. }
  3660. /*
  3661. * Has this node been popped out during parsing of
  3662. * the next element
  3663. */
  3664. if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
  3665. (!xmlStrEqual(currentNode, ctxt->name)))
  3666. {
  3667. if (currentNode != NULL) xmlFree(currentNode);
  3668. return;
  3669. }
  3670. if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
  3671. (xmlStrEqual(currentNode, BAD_CAST"style")))) {
  3672. /*
  3673. * Handle SCRIPT/STYLE separately
  3674. */
  3675. htmlParseScript(ctxt);
  3676. } else {
  3677. /*
  3678. * Sometimes DOCTYPE arrives in the middle of the document
  3679. */
  3680. if ((CUR == '<') && (NXT(1) == '!') &&
  3681. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  3682. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  3683. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  3684. (UPP(8) == 'E')) {
  3685. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  3686. "Misplaced DOCTYPE declaration\n",
  3687. BAD_CAST "DOCTYPE" , NULL);
  3688. htmlParseDocTypeDecl(ctxt);
  3689. }
  3690. /*
  3691. * First case : a comment
  3692. */
  3693. if ((CUR == '<') && (NXT(1) == '!') &&
  3694. (NXT(2) == '-') && (NXT(3) == '-')) {
  3695. htmlParseComment(ctxt);
  3696. }
  3697. /*
  3698. * Second case : a Processing Instruction.
  3699. */
  3700. else if ((CUR == '<') && (NXT(1) == '?')) {
  3701. htmlParsePI(ctxt);
  3702. }
  3703. /*
  3704. * Third case : a sub-element.
  3705. */
  3706. else if (CUR == '<') {
  3707. htmlParseElement(ctxt);
  3708. }
  3709. /*
  3710. * Fourth case : a reference. If if has not been resolved,
  3711. * parsing returns it's Name, create the node
  3712. */
  3713. else if (CUR == '&') {
  3714. htmlParseReference(ctxt);
  3715. }
  3716. /*
  3717. * Fifth case : end of the resource
  3718. */
  3719. else if (CUR == 0) {
  3720. htmlAutoCloseOnEnd(ctxt);
  3721. break;
  3722. }
  3723. /*
  3724. * Last case, text. Note that References are handled directly.
  3725. */
  3726. else {
  3727. htmlParseCharData(ctxt);
  3728. }
  3729. if (cons == ctxt->nbChars) {
  3730. if (ctxt->node != NULL) {
  3731. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3732. "detected an error in element content\n",
  3733. NULL, NULL);
  3734. }
  3735. break;
  3736. }
  3737. }
  3738. GROW;
  3739. }
  3740. if (currentNode != NULL) xmlFree(currentNode);
  3741. }
  3742. /**
  3743. * htmlParseContent:
  3744. * @ctxt: an HTML parser context
  3745. *
  3746. * Parse a content: comment, sub-element, reference or text.
  3747. */
  3748. void
  3749. __htmlParseContent(void *ctxt) {
  3750. if (ctxt != NULL)
  3751. htmlParseContent((htmlParserCtxtPtr) ctxt);
  3752. }
  3753. /**
  3754. * htmlParseElement:
  3755. * @ctxt: an HTML parser context
  3756. *
  3757. * parse an HTML element, this is highly recursive
  3758. *
  3759. * [39] element ::= EmptyElemTag | STag content ETag
  3760. *
  3761. * [41] Attribute ::= Name Eq AttValue
  3762. */
  3763. void
  3764. htmlParseElement(htmlParserCtxtPtr ctxt) {
  3765. const xmlChar *name;
  3766. xmlChar *currentNode = NULL;
  3767. const htmlElemDesc * info;
  3768. htmlParserNodeInfo node_info;
  3769. int failed;
  3770. int depth;
  3771. const xmlChar *oldptr;
  3772. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3773. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3774. "htmlParseElement: context error\n", NULL, NULL);
  3775. return;
  3776. }
  3777. if (ctxt->instate == XML_PARSER_EOF)
  3778. return;
  3779. /* Capture start position */
  3780. if (ctxt->record_info) {
  3781. node_info.begin_pos = ctxt->input->consumed +
  3782. (CUR_PTR - ctxt->input->base);
  3783. node_info.begin_line = ctxt->input->line;
  3784. }
  3785. failed = htmlParseStartTag(ctxt);
  3786. name = ctxt->name;
  3787. if ((failed == -1) || (name == NULL)) {
  3788. if (CUR == '>')
  3789. NEXT;
  3790. return;
  3791. }
  3792. /*
  3793. * Lookup the info for that element.
  3794. */
  3795. info = htmlTagLookup(name);
  3796. if (info == NULL) {
  3797. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  3798. "Tag %s invalid\n", name, NULL);
  3799. }
  3800. /*
  3801. * Check for an Empty Element labeled the XML/SGML way
  3802. */
  3803. if ((CUR == '/') && (NXT(1) == '>')) {
  3804. SKIP(2);
  3805. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  3806. ctxt->sax->endElement(ctxt->userData, name);
  3807. htmlnamePop(ctxt);
  3808. return;
  3809. }
  3810. if (CUR == '>') {
  3811. NEXT;
  3812. } else {
  3813. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  3814. "Couldn't find end of Start Tag %s\n", name, NULL);
  3815. /*
  3816. * end of parsing of this node.
  3817. */
  3818. if (xmlStrEqual(name, ctxt->name)) {
  3819. nodePop(ctxt);
  3820. htmlnamePop(ctxt);
  3821. }
  3822. /*
  3823. * Capture end position and add node
  3824. */
  3825. if (ctxt->record_info) {
  3826. node_info.end_pos = ctxt->input->consumed +
  3827. (CUR_PTR - ctxt->input->base);
  3828. node_info.end_line = ctxt->input->line;
  3829. node_info.node = ctxt->node;
  3830. xmlParserAddNodeInfo(ctxt, &node_info);
  3831. }
  3832. return;
  3833. }
  3834. /*
  3835. * Check for an Empty Element from DTD definition
  3836. */
  3837. if ((info != NULL) && (info->empty)) {
  3838. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  3839. ctxt->sax->endElement(ctxt->userData, name);
  3840. htmlnamePop(ctxt);
  3841. return;
  3842. }
  3843. /*
  3844. * Parse the content of the element:
  3845. */
  3846. currentNode = xmlStrdup(ctxt->name);
  3847. depth = ctxt->nameNr;
  3848. while (IS_CHAR_CH(CUR)) {
  3849. oldptr = ctxt->input->cur;
  3850. htmlParseContent(ctxt);
  3851. if (oldptr==ctxt->input->cur) break;
  3852. if (ctxt->nameNr < depth) break;
  3853. }
  3854. /*
  3855. * Capture end position and add node
  3856. */
  3857. if ( currentNode != NULL && ctxt->record_info ) {
  3858. node_info.end_pos = ctxt->input->consumed +
  3859. (CUR_PTR - ctxt->input->base);
  3860. node_info.end_line = ctxt->input->line;
  3861. node_info.node = ctxt->node;
  3862. xmlParserAddNodeInfo(ctxt, &node_info);
  3863. }
  3864. if (!IS_CHAR_CH(CUR)) {
  3865. htmlAutoCloseOnEnd(ctxt);
  3866. }
  3867. if (currentNode != NULL)
  3868. xmlFree(currentNode);
  3869. }
  3870. /**
  3871. * htmlParseDocument:
  3872. * @ctxt: an HTML parser context
  3873. *
  3874. * parse an HTML document (and build a tree if using the standard SAX
  3875. * interface).
  3876. *
  3877. * Returns 0, -1 in case of error. the parser context is augmented
  3878. * as a result of the parsing.
  3879. */
  3880. int
  3881. htmlParseDocument(htmlParserCtxtPtr ctxt) {
  3882. xmlChar start[4];
  3883. xmlCharEncoding enc;
  3884. xmlDtdPtr dtd;
  3885. xmlInitParser();
  3886. htmlDefaultSAXHandlerInit();
  3887. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  3888. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  3889. "htmlParseDocument: context error\n", NULL, NULL);
  3890. return(XML_ERR_INTERNAL_ERROR);
  3891. }
  3892. ctxt->html = 1;
  3893. ctxt->linenumbers = 1;
  3894. GROW;
  3895. /*
  3896. * SAX: beginning of the document processing.
  3897. */
  3898. if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
  3899. ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
  3900. if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
  3901. ((ctxt->input->end - ctxt->input->cur) >= 4)) {
  3902. /*
  3903. * Get the 4 first bytes and decode the charset
  3904. * if enc != XML_CHAR_ENCODING_NONE
  3905. * plug some encoding conversion routines.
  3906. */
  3907. start[0] = RAW;
  3908. start[1] = NXT(1);
  3909. start[2] = NXT(2);
  3910. start[3] = NXT(3);
  3911. enc = xmlDetectCharEncoding(&start[0], 4);
  3912. if (enc != XML_CHAR_ENCODING_NONE) {
  3913. xmlSwitchEncoding(ctxt, enc);
  3914. }
  3915. }
  3916. /*
  3917. * Wipe out everything which is before the first '<'
  3918. */
  3919. SKIP_BLANKS;
  3920. if (CUR == 0) {
  3921. htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
  3922. "Document is empty\n", NULL, NULL);
  3923. }
  3924. if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
  3925. ctxt->sax->startDocument(ctxt->userData);
  3926. /*
  3927. * Parse possible comments and PIs before any content
  3928. */
  3929. while (((CUR == '<') && (NXT(1) == '!') &&
  3930. (NXT(2) == '-') && (NXT(3) == '-')) ||
  3931. ((CUR == '<') && (NXT(1) == '?'))) {
  3932. htmlParseComment(ctxt);
  3933. htmlParsePI(ctxt);
  3934. SKIP_BLANKS;
  3935. }
  3936. /*
  3937. * Then possibly doc type declaration(s) and more Misc
  3938. * (doctypedecl Misc*)?
  3939. */
  3940. if ((CUR == '<') && (NXT(1) == '!') &&
  3941. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  3942. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  3943. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  3944. (UPP(8) == 'E')) {
  3945. htmlParseDocTypeDecl(ctxt);
  3946. }
  3947. SKIP_BLANKS;
  3948. /*
  3949. * Parse possible comments and PIs before any content
  3950. */
  3951. while (((CUR == '<') && (NXT(1) == '!') &&
  3952. (NXT(2) == '-') && (NXT(3) == '-')) ||
  3953. ((CUR == '<') && (NXT(1) == '?'))) {
  3954. htmlParseComment(ctxt);
  3955. htmlParsePI(ctxt);
  3956. SKIP_BLANKS;
  3957. }
  3958. /*
  3959. * Time to start parsing the tree itself
  3960. */
  3961. htmlParseContent(ctxt);
  3962. /*
  3963. * autoclose
  3964. */
  3965. if (CUR == 0)
  3966. htmlAutoCloseOnEnd(ctxt);
  3967. /*
  3968. * SAX: end of the document processing.
  3969. */
  3970. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  3971. ctxt->sax->endDocument(ctxt->userData);
  3972. if (ctxt->myDoc != NULL) {
  3973. dtd = xmlGetIntSubset(ctxt->myDoc);
  3974. if (dtd == NULL)
  3975. ctxt->myDoc->intSubset =
  3976. xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
  3977. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
  3978. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
  3979. }
  3980. if (! ctxt->wellFormed) return(-1);
  3981. return(0);
  3982. }
  3983. /************************************************************************
  3984. * *
  3985. * Parser contexts handling *
  3986. * *
  3987. ************************************************************************/
  3988. /**
  3989. * htmlInitParserCtxt:
  3990. * @ctxt: an HTML parser context
  3991. *
  3992. * Initialize a parser context
  3993. *
  3994. * Returns 0 in case of success and -1 in case of error
  3995. */
  3996. static int
  3997. htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
  3998. {
  3999. htmlSAXHandler *sax;
  4000. if (ctxt == NULL) return(-1);
  4001. memset(ctxt, 0, sizeof(htmlParserCtxt));
  4002. ctxt->dict = xmlDictCreate();
  4003. if (ctxt->dict == NULL) {
  4004. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4005. return(-1);
  4006. }
  4007. sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
  4008. if (sax == NULL) {
  4009. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4010. return(-1);
  4011. }
  4012. else
  4013. memset(sax, 0, sizeof(htmlSAXHandler));
  4014. /* Allocate the Input stack */
  4015. ctxt->inputTab = (htmlParserInputPtr *)
  4016. xmlMalloc(5 * sizeof(htmlParserInputPtr));
  4017. if (ctxt->inputTab == NULL) {
  4018. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4019. ctxt->inputNr = 0;
  4020. ctxt->inputMax = 0;
  4021. ctxt->input = NULL;
  4022. return(-1);
  4023. }
  4024. ctxt->inputNr = 0;
  4025. ctxt->inputMax = 5;
  4026. ctxt->input = NULL;
  4027. ctxt->version = NULL;
  4028. ctxt->encoding = NULL;
  4029. ctxt->standalone = -1;
  4030. ctxt->instate = XML_PARSER_START;
  4031. /* Allocate the Node stack */
  4032. ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
  4033. if (ctxt->nodeTab == NULL) {
  4034. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4035. ctxt->nodeNr = 0;
  4036. ctxt->nodeMax = 0;
  4037. ctxt->node = NULL;
  4038. ctxt->inputNr = 0;
  4039. ctxt->inputMax = 0;
  4040. ctxt->input = NULL;
  4041. return(-1);
  4042. }
  4043. ctxt->nodeNr = 0;
  4044. ctxt->nodeMax = 10;
  4045. ctxt->node = NULL;
  4046. /* Allocate the Name stack */
  4047. ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
  4048. if (ctxt->nameTab == NULL) {
  4049. htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
  4050. ctxt->nameNr = 0;
  4051. ctxt->nameMax = 10;
  4052. ctxt->name = NULL;
  4053. ctxt->nodeNr = 0;
  4054. ctxt->nodeMax = 0;
  4055. ctxt->node = NULL;
  4056. ctxt->inputNr = 0;
  4057. ctxt->inputMax = 0;
  4058. ctxt->input = NULL;
  4059. return(-1);
  4060. }
  4061. ctxt->nameNr = 0;
  4062. ctxt->nameMax = 10;
  4063. ctxt->name = NULL;
  4064. if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
  4065. else {
  4066. ctxt->sax = sax;
  4067. memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
  4068. }
  4069. ctxt->userData = ctxt;
  4070. ctxt->myDoc = NULL;
  4071. ctxt->wellFormed = 1;
  4072. ctxt->replaceEntities = 0;
  4073. ctxt->linenumbers = xmlLineNumbersDefaultValue;
  4074. ctxt->html = 1;
  4075. ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
  4076. ctxt->vctxt.userData = ctxt;
  4077. ctxt->vctxt.error = xmlParserValidityError;
  4078. ctxt->vctxt.warning = xmlParserValidityWarning;
  4079. ctxt->record_info = 0;
  4080. ctxt->validate = 0;
  4081. ctxt->nbChars = 0;
  4082. ctxt->checkIndex = 0;
  4083. ctxt->catalogs = NULL;
  4084. xmlInitNodeInfoSeq(&ctxt->node_seq);
  4085. return(0);
  4086. }
  4087. /**
  4088. * htmlFreeParserCtxt:
  4089. * @ctxt: an HTML parser context
  4090. *
  4091. * Free all the memory used by a parser context. However the parsed
  4092. * document in ctxt->myDoc is not freed.
  4093. */
  4094. void
  4095. htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
  4096. {
  4097. xmlFreeParserCtxt(ctxt);
  4098. }
  4099. /**
  4100. * htmlNewParserCtxt:
  4101. *
  4102. * Allocate and initialize a new parser context.
  4103. *
  4104. * Returns the htmlParserCtxtPtr or NULL in case of allocation error
  4105. */
  4106. htmlParserCtxtPtr
  4107. htmlNewParserCtxt(void)
  4108. {
  4109. xmlParserCtxtPtr ctxt;
  4110. ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
  4111. if (ctxt == NULL) {
  4112. htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
  4113. return(NULL);
  4114. }
  4115. memset(ctxt, 0, sizeof(xmlParserCtxt));
  4116. if (htmlInitParserCtxt(ctxt) < 0) {
  4117. htmlFreeParserCtxt(ctxt);
  4118. return(NULL);
  4119. }
  4120. return(ctxt);
  4121. }
  4122. /**
  4123. * htmlCreateMemoryParserCtxt:
  4124. * @buffer: a pointer to a char array
  4125. * @size: the size of the array
  4126. *
  4127. * Create a parser context for an HTML in-memory document.
  4128. *
  4129. * Returns the new parser context or NULL
  4130. */
  4131. htmlParserCtxtPtr
  4132. htmlCreateMemoryParserCtxt(const char *buffer, int size) {
  4133. xmlParserCtxtPtr ctxt;
  4134. xmlParserInputPtr input;
  4135. xmlParserInputBufferPtr buf;
  4136. if (buffer == NULL)
  4137. return(NULL);
  4138. if (size <= 0)
  4139. return(NULL);
  4140. ctxt = htmlNewParserCtxt();
  4141. if (ctxt == NULL)
  4142. return(NULL);
  4143. buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
  4144. if (buf == NULL) return(NULL);
  4145. input = xmlNewInputStream(ctxt);
  4146. if (input == NULL) {
  4147. xmlFreeParserCtxt(ctxt);
  4148. return(NULL);
  4149. }
  4150. input->filename = NULL;
  4151. input->buf = buf;
  4152. input->base = input->buf->buffer->content;
  4153. input->cur = input->buf->buffer->content;
  4154. input->end = &input->buf->buffer->content[input->buf->buffer->use];
  4155. inputPush(ctxt, input);
  4156. return(ctxt);
  4157. }
  4158. /**
  4159. * htmlCreateDocParserCtxt:
  4160. * @cur: a pointer to an array of xmlChar
  4161. * @encoding: a free form C string describing the HTML document encoding, or NULL
  4162. *
  4163. * Create a parser context for an HTML document.
  4164. *
  4165. * TODO: check the need to add encoding handling there
  4166. *
  4167. * Returns the new parser context or NULL
  4168. */
  4169. static htmlParserCtxtPtr
  4170. htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
  4171. int len;
  4172. htmlParserCtxtPtr ctxt;
  4173. if (cur == NULL)
  4174. return(NULL);
  4175. len = xmlStrlen(cur);
  4176. ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
  4177. if (ctxt == NULL)
  4178. return(NULL);
  4179. if (encoding != NULL) {
  4180. xmlCharEncoding enc;
  4181. xmlCharEncodingHandlerPtr handler;
  4182. if (ctxt->input->encoding != NULL)
  4183. xmlFree((xmlChar *) ctxt->input->encoding);
  4184. ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
  4185. enc = xmlParseCharEncoding(encoding);
  4186. /*
  4187. * registered set of known encodings
  4188. */
  4189. if (enc != XML_CHAR_ENCODING_ERROR) {
  4190. xmlSwitchEncoding(ctxt, enc);
  4191. if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
  4192. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  4193. "Unsupported encoding %s\n",
  4194. (const xmlChar *) encoding, NULL);
  4195. }
  4196. } else {
  4197. /*
  4198. * fallback for unknown encodings
  4199. */
  4200. handler = xmlFindCharEncodingHandler((const char *) encoding);
  4201. if (handler != NULL) {
  4202. xmlSwitchToEncoding(ctxt, handler);
  4203. } else {
  4204. htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
  4205. "Unsupported encoding %s\n",
  4206. (const xmlChar *) encoding, NULL);
  4207. }
  4208. }
  4209. }
  4210. return(ctxt);
  4211. }
  4212. #ifdef LIBXML_PUSH_ENABLED
  4213. /************************************************************************
  4214. * *
  4215. * Progressive parsing interfaces *
  4216. * *
  4217. ************************************************************************/
  4218. /**
  4219. * htmlParseLookupSequence:
  4220. * @ctxt: an HTML parser context
  4221. * @first: the first char to lookup
  4222. * @next: the next char to lookup or zero
  4223. * @third: the next char to lookup or zero
  4224. * @comment: flag to force checking inside comments
  4225. *
  4226. * Try to find if a sequence (first, next, third) or just (first next) or
  4227. * (first) is available in the input stream.
  4228. * This function has a side effect of (possibly) incrementing ctxt->checkIndex
  4229. * to avoid rescanning sequences of bytes, it DOES change the state of the
  4230. * parser, do not use liberally.
  4231. * This is basically similar to xmlParseLookupSequence()
  4232. *
  4233. * Returns the index to the current parsing point if the full sequence
  4234. * is available, -1 otherwise.
  4235. */
  4236. static int
  4237. htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
  4238. xmlChar next, xmlChar third, int iscomment,
  4239. int ignoreattrval)
  4240. {
  4241. int base, len;
  4242. htmlParserInputPtr in;
  4243. const xmlChar *buf;
  4244. int incomment = 0;
  4245. int invalue = 0;
  4246. char valdellim = 0x0;
  4247. in = ctxt->input;
  4248. if (in == NULL)
  4249. return (-1);
  4250. base = in->cur - in->base;
  4251. if (base < 0)
  4252. return (-1);
  4253. if (ctxt->checkIndex > base)
  4254. base = ctxt->checkIndex;
  4255. if (in->buf == NULL) {
  4256. buf = in->base;
  4257. len = in->length;
  4258. } else {
  4259. buf = in->buf->buffer->content;
  4260. len = in->buf->buffer->use;
  4261. }
  4262. /* take into account the sequence length */
  4263. if (third)
  4264. len -= 2;
  4265. else if (next)
  4266. len--;
  4267. for (; base < len; base++) {
  4268. if ((!incomment) && (base + 4 < len) && (!iscomment)) {
  4269. if ((buf[base] == '<') && (buf[base + 1] == '!') &&
  4270. (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
  4271. incomment = 1;
  4272. /* do not increment past <! - some people use <!--> */
  4273. base += 2;
  4274. }
  4275. }
  4276. if (ignoreattrval) {
  4277. if (buf[base] == '"' || buf[base] == '\'') {
  4278. if (invalue) {
  4279. if (buf[base] == valdellim) {
  4280. invalue = 0;
  4281. continue;
  4282. }
  4283. } else {
  4284. valdellim = buf[base];
  4285. invalue = 1;
  4286. continue;
  4287. }
  4288. } else if (invalue) {
  4289. continue;
  4290. }
  4291. }
  4292. if (incomment) {
  4293. if (base + 3 > len)
  4294. return (-1);
  4295. if ((buf[base] == '-') && (buf[base + 1] == '-') &&
  4296. (buf[base + 2] == '>')) {
  4297. incomment = 0;
  4298. base += 2;
  4299. }
  4300. continue;
  4301. }
  4302. if (buf[base] == first) {
  4303. if (third != 0) {
  4304. if ((buf[base + 1] != next) || (buf[base + 2] != third))
  4305. continue;
  4306. } else if (next != 0) {
  4307. if (buf[base + 1] != next)
  4308. continue;
  4309. }
  4310. ctxt->checkIndex = 0;
  4311. #ifdef DEBUG_PUSH
  4312. if (next == 0)
  4313. xmlGenericError(xmlGenericErrorContext,
  4314. "HPP: lookup '%c' found at %d\n",
  4315. first, base);
  4316. else if (third == 0)
  4317. xmlGenericError(xmlGenericErrorContext,
  4318. "HPP: lookup '%c%c' found at %d\n",
  4319. first, next, base);
  4320. else
  4321. xmlGenericError(xmlGenericErrorContext,
  4322. "HPP: lookup '%c%c%c' found at %d\n",
  4323. first, next, third, base);
  4324. #endif
  4325. return (base - (in->cur - in->base));
  4326. }
  4327. }
  4328. if ((!incomment) && (!invalue))
  4329. ctxt->checkIndex = base;
  4330. #ifdef DEBUG_PUSH
  4331. if (next == 0)
  4332. xmlGenericError(xmlGenericErrorContext,
  4333. "HPP: lookup '%c' failed\n", first);
  4334. else if (third == 0)
  4335. xmlGenericError(xmlGenericErrorContext,
  4336. "HPP: lookup '%c%c' failed\n", first, next);
  4337. else
  4338. xmlGenericError(xmlGenericErrorContext,
  4339. "HPP: lookup '%c%c%c' failed\n", first, next,
  4340. third);
  4341. #endif
  4342. return (-1);
  4343. }
  4344. /**
  4345. * htmlParseLookupChars:
  4346. * @ctxt: an HTML parser context
  4347. * @stop: Array of chars, which stop the lookup.
  4348. * @stopLen: Length of stop-Array
  4349. *
  4350. * Try to find if any char of the stop-Array is available in the input
  4351. * stream.
  4352. * This function has a side effect of (possibly) incrementing ctxt->checkIndex
  4353. * to avoid rescanning sequences of bytes, it DOES change the state of the
  4354. * parser, do not use liberally.
  4355. *
  4356. * Returns the index to the current parsing point if a stopChar
  4357. * is available, -1 otherwise.
  4358. */
  4359. static int
  4360. htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
  4361. int stopLen)
  4362. {
  4363. int base, len;
  4364. htmlParserInputPtr in;
  4365. const xmlChar *buf;
  4366. int incomment = 0;
  4367. int i;
  4368. in = ctxt->input;
  4369. if (in == NULL)
  4370. return (-1);
  4371. base = in->cur - in->base;
  4372. if (base < 0)
  4373. return (-1);
  4374. if (ctxt->checkIndex > base)
  4375. base = ctxt->checkIndex;
  4376. if (in->buf == NULL) {
  4377. buf = in->base;
  4378. len = in->length;
  4379. } else {
  4380. buf = in->buf->buffer->content;
  4381. len = in->buf->buffer->use;
  4382. }
  4383. for (; base < len; base++) {
  4384. if (!incomment && (base + 4 < len)) {
  4385. if ((buf[base] == '<') && (buf[base + 1] == '!') &&
  4386. (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
  4387. incomment = 1;
  4388. /* do not increment past <! - some people use <!--> */
  4389. base += 2;
  4390. }
  4391. }
  4392. if (incomment) {
  4393. if (base + 3 > len)
  4394. return (-1);
  4395. if ((buf[base] == '-') && (buf[base + 1] == '-') &&
  4396. (buf[base + 2] == '>')) {
  4397. incomment = 0;
  4398. base += 2;
  4399. }
  4400. continue;
  4401. }
  4402. for (i = 0; i < stopLen; ++i) {
  4403. if (buf[base] == stop[i]) {
  4404. ctxt->checkIndex = 0;
  4405. return (base - (in->cur - in->base));
  4406. }
  4407. }
  4408. }
  4409. ctxt->checkIndex = base;
  4410. return (-1);
  4411. }
  4412. /**
  4413. * htmlParseTryOrFinish:
  4414. * @ctxt: an HTML parser context
  4415. * @terminate: last chunk indicator
  4416. *
  4417. * Try to progress on parsing
  4418. *
  4419. * Returns zero if no parsing was possible
  4420. */
  4421. static int
  4422. htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
  4423. int ret = 0;
  4424. htmlParserInputPtr in;
  4425. int avail = 0;
  4426. xmlChar cur, next;
  4427. #ifdef DEBUG_PUSH
  4428. switch (ctxt->instate) {
  4429. case XML_PARSER_EOF:
  4430. xmlGenericError(xmlGenericErrorContext,
  4431. "HPP: try EOF\n"); break;
  4432. case XML_PARSER_START:
  4433. xmlGenericError(xmlGenericErrorContext,
  4434. "HPP: try START\n"); break;
  4435. case XML_PARSER_MISC:
  4436. xmlGenericError(xmlGenericErrorContext,
  4437. "HPP: try MISC\n");break;
  4438. case XML_PARSER_COMMENT:
  4439. xmlGenericError(xmlGenericErrorContext,
  4440. "HPP: try COMMENT\n");break;
  4441. case XML_PARSER_PROLOG:
  4442. xmlGenericError(xmlGenericErrorContext,
  4443. "HPP: try PROLOG\n");break;
  4444. case XML_PARSER_START_TAG:
  4445. xmlGenericError(xmlGenericErrorContext,
  4446. "HPP: try START_TAG\n");break;
  4447. case XML_PARSER_CONTENT:
  4448. xmlGenericError(xmlGenericErrorContext,
  4449. "HPP: try CONTENT\n");break;
  4450. case XML_PARSER_CDATA_SECTION:
  4451. xmlGenericError(xmlGenericErrorContext,
  4452. "HPP: try CDATA_SECTION\n");break;
  4453. case XML_PARSER_END_TAG:
  4454. xmlGenericError(xmlGenericErrorContext,
  4455. "HPP: try END_TAG\n");break;
  4456. case XML_PARSER_ENTITY_DECL:
  4457. xmlGenericError(xmlGenericErrorContext,
  4458. "HPP: try ENTITY_DECL\n");break;
  4459. case XML_PARSER_ENTITY_VALUE:
  4460. xmlGenericError(xmlGenericErrorContext,
  4461. "HPP: try ENTITY_VALUE\n");break;
  4462. case XML_PARSER_ATTRIBUTE_VALUE:
  4463. xmlGenericError(xmlGenericErrorContext,
  4464. "HPP: try ATTRIBUTE_VALUE\n");break;
  4465. case XML_PARSER_DTD:
  4466. xmlGenericError(xmlGenericErrorContext,
  4467. "HPP: try DTD\n");break;
  4468. case XML_PARSER_EPILOG:
  4469. xmlGenericError(xmlGenericErrorContext,
  4470. "HPP: try EPILOG\n");break;
  4471. case XML_PARSER_PI:
  4472. xmlGenericError(xmlGenericErrorContext,
  4473. "HPP: try PI\n");break;
  4474. case XML_PARSER_SYSTEM_LITERAL:
  4475. xmlGenericError(xmlGenericErrorContext,
  4476. "HPP: try SYSTEM_LITERAL\n");break;
  4477. }
  4478. #endif
  4479. while (1) {
  4480. in = ctxt->input;
  4481. if (in == NULL) break;
  4482. if (in->buf == NULL)
  4483. avail = in->length - (in->cur - in->base);
  4484. else
  4485. avail = in->buf->buffer->use - (in->cur - in->base);
  4486. if ((avail == 0) && (terminate)) {
  4487. htmlAutoCloseOnEnd(ctxt);
  4488. if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
  4489. /*
  4490. * SAX: end of the document processing.
  4491. */
  4492. ctxt->instate = XML_PARSER_EOF;
  4493. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  4494. ctxt->sax->endDocument(ctxt->userData);
  4495. }
  4496. }
  4497. if (avail < 1)
  4498. goto done;
  4499. cur = in->cur[0];
  4500. if (cur == 0) {
  4501. SKIP(1);
  4502. continue;
  4503. }
  4504. switch (ctxt->instate) {
  4505. case XML_PARSER_EOF:
  4506. /*
  4507. * Document parsing is done !
  4508. */
  4509. goto done;
  4510. case XML_PARSER_START:
  4511. /*
  4512. * Very first chars read from the document flow.
  4513. */
  4514. cur = in->cur[0];
  4515. if (IS_BLANK_CH(cur)) {
  4516. SKIP_BLANKS;
  4517. if (in->buf == NULL)
  4518. avail = in->length - (in->cur - in->base);
  4519. else
  4520. avail = in->buf->buffer->use - (in->cur - in->base);
  4521. }
  4522. if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
  4523. ctxt->sax->setDocumentLocator(ctxt->userData,
  4524. &xmlDefaultSAXLocator);
  4525. if ((ctxt->sax) && (ctxt->sax->startDocument) &&
  4526. (!ctxt->disableSAX))
  4527. ctxt->sax->startDocument(ctxt->userData);
  4528. cur = in->cur[0];
  4529. next = in->cur[1];
  4530. if ((cur == '<') && (next == '!') &&
  4531. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4532. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4533. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4534. (UPP(8) == 'E')) {
  4535. if ((!terminate) &&
  4536. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4537. goto done;
  4538. #ifdef DEBUG_PUSH
  4539. xmlGenericError(xmlGenericErrorContext,
  4540. "HPP: Parsing internal subset\n");
  4541. #endif
  4542. htmlParseDocTypeDecl(ctxt);
  4543. ctxt->instate = XML_PARSER_PROLOG;
  4544. #ifdef DEBUG_PUSH
  4545. xmlGenericError(xmlGenericErrorContext,
  4546. "HPP: entering PROLOG\n");
  4547. #endif
  4548. } else {
  4549. ctxt->instate = XML_PARSER_MISC;
  4550. #ifdef DEBUG_PUSH
  4551. xmlGenericError(xmlGenericErrorContext,
  4552. "HPP: entering MISC\n");
  4553. #endif
  4554. }
  4555. break;
  4556. case XML_PARSER_MISC:
  4557. SKIP_BLANKS;
  4558. if (in->buf == NULL)
  4559. avail = in->length - (in->cur - in->base);
  4560. else
  4561. avail = in->buf->buffer->use - (in->cur - in->base);
  4562. if (avail < 2)
  4563. goto done;
  4564. cur = in->cur[0];
  4565. next = in->cur[1];
  4566. if ((cur == '<') && (next == '!') &&
  4567. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  4568. if ((!terminate) &&
  4569. (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
  4570. goto done;
  4571. #ifdef DEBUG_PUSH
  4572. xmlGenericError(xmlGenericErrorContext,
  4573. "HPP: Parsing Comment\n");
  4574. #endif
  4575. htmlParseComment(ctxt);
  4576. ctxt->instate = XML_PARSER_MISC;
  4577. } else if ((cur == '<') && (next == '?')) {
  4578. if ((!terminate) &&
  4579. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4580. goto done;
  4581. #ifdef DEBUG_PUSH
  4582. xmlGenericError(xmlGenericErrorContext,
  4583. "HPP: Parsing PI\n");
  4584. #endif
  4585. htmlParsePI(ctxt);
  4586. ctxt->instate = XML_PARSER_MISC;
  4587. } else if ((cur == '<') && (next == '!') &&
  4588. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4589. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4590. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4591. (UPP(8) == 'E')) {
  4592. if ((!terminate) &&
  4593. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4594. goto done;
  4595. #ifdef DEBUG_PUSH
  4596. xmlGenericError(xmlGenericErrorContext,
  4597. "HPP: Parsing internal subset\n");
  4598. #endif
  4599. htmlParseDocTypeDecl(ctxt);
  4600. ctxt->instate = XML_PARSER_PROLOG;
  4601. #ifdef DEBUG_PUSH
  4602. xmlGenericError(xmlGenericErrorContext,
  4603. "HPP: entering PROLOG\n");
  4604. #endif
  4605. } else if ((cur == '<') && (next == '!') &&
  4606. (avail < 9)) {
  4607. goto done;
  4608. } else {
  4609. ctxt->instate = XML_PARSER_START_TAG;
  4610. #ifdef DEBUG_PUSH
  4611. xmlGenericError(xmlGenericErrorContext,
  4612. "HPP: entering START_TAG\n");
  4613. #endif
  4614. }
  4615. break;
  4616. case XML_PARSER_PROLOG:
  4617. SKIP_BLANKS;
  4618. if (in->buf == NULL)
  4619. avail = in->length - (in->cur - in->base);
  4620. else
  4621. avail = in->buf->buffer->use - (in->cur - in->base);
  4622. if (avail < 2)
  4623. goto done;
  4624. cur = in->cur[0];
  4625. next = in->cur[1];
  4626. if ((cur == '<') && (next == '!') &&
  4627. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  4628. if ((!terminate) &&
  4629. (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
  4630. goto done;
  4631. #ifdef DEBUG_PUSH
  4632. xmlGenericError(xmlGenericErrorContext,
  4633. "HPP: Parsing Comment\n");
  4634. #endif
  4635. htmlParseComment(ctxt);
  4636. ctxt->instate = XML_PARSER_PROLOG;
  4637. } else if ((cur == '<') && (next == '?')) {
  4638. if ((!terminate) &&
  4639. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4640. goto done;
  4641. #ifdef DEBUG_PUSH
  4642. xmlGenericError(xmlGenericErrorContext,
  4643. "HPP: Parsing PI\n");
  4644. #endif
  4645. htmlParsePI(ctxt);
  4646. ctxt->instate = XML_PARSER_PROLOG;
  4647. } else if ((cur == '<') && (next == '!') &&
  4648. (avail < 4)) {
  4649. goto done;
  4650. } else {
  4651. ctxt->instate = XML_PARSER_START_TAG;
  4652. #ifdef DEBUG_PUSH
  4653. xmlGenericError(xmlGenericErrorContext,
  4654. "HPP: entering START_TAG\n");
  4655. #endif
  4656. }
  4657. break;
  4658. case XML_PARSER_EPILOG:
  4659. if (in->buf == NULL)
  4660. avail = in->length - (in->cur - in->base);
  4661. else
  4662. avail = in->buf->buffer->use - (in->cur - in->base);
  4663. if (avail < 1)
  4664. goto done;
  4665. cur = in->cur[0];
  4666. if (IS_BLANK_CH(cur)) {
  4667. htmlParseCharData(ctxt);
  4668. goto done;
  4669. }
  4670. if (avail < 2)
  4671. goto done;
  4672. next = in->cur[1];
  4673. if ((cur == '<') && (next == '!') &&
  4674. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  4675. if ((!terminate) &&
  4676. (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
  4677. goto done;
  4678. #ifdef DEBUG_PUSH
  4679. xmlGenericError(xmlGenericErrorContext,
  4680. "HPP: Parsing Comment\n");
  4681. #endif
  4682. htmlParseComment(ctxt);
  4683. ctxt->instate = XML_PARSER_EPILOG;
  4684. } else if ((cur == '<') && (next == '?')) {
  4685. if ((!terminate) &&
  4686. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4687. goto done;
  4688. #ifdef DEBUG_PUSH
  4689. xmlGenericError(xmlGenericErrorContext,
  4690. "HPP: Parsing PI\n");
  4691. #endif
  4692. htmlParsePI(ctxt);
  4693. ctxt->instate = XML_PARSER_EPILOG;
  4694. } else if ((cur == '<') && (next == '!') &&
  4695. (avail < 4)) {
  4696. goto done;
  4697. } else {
  4698. ctxt->errNo = XML_ERR_DOCUMENT_END;
  4699. ctxt->wellFormed = 0;
  4700. ctxt->instate = XML_PARSER_EOF;
  4701. #ifdef DEBUG_PUSH
  4702. xmlGenericError(xmlGenericErrorContext,
  4703. "HPP: entering EOF\n");
  4704. #endif
  4705. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  4706. ctxt->sax->endDocument(ctxt->userData);
  4707. goto done;
  4708. }
  4709. break;
  4710. case XML_PARSER_START_TAG: {
  4711. const xmlChar *name;
  4712. int failed;
  4713. const htmlElemDesc * info;
  4714. if (avail < 2)
  4715. goto done;
  4716. cur = in->cur[0];
  4717. if (cur != '<') {
  4718. ctxt->instate = XML_PARSER_CONTENT;
  4719. #ifdef DEBUG_PUSH
  4720. xmlGenericError(xmlGenericErrorContext,
  4721. "HPP: entering CONTENT\n");
  4722. #endif
  4723. break;
  4724. }
  4725. if (in->cur[1] == '/') {
  4726. ctxt->instate = XML_PARSER_END_TAG;
  4727. ctxt->checkIndex = 0;
  4728. #ifdef DEBUG_PUSH
  4729. xmlGenericError(xmlGenericErrorContext,
  4730. "HPP: entering END_TAG\n");
  4731. #endif
  4732. break;
  4733. }
  4734. if ((!terminate) &&
  4735. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4736. goto done;
  4737. failed = htmlParseStartTag(ctxt);
  4738. name = ctxt->name;
  4739. if ((failed == -1) ||
  4740. (name == NULL)) {
  4741. if (CUR == '>')
  4742. NEXT;
  4743. break;
  4744. }
  4745. /*
  4746. * Lookup the info for that element.
  4747. */
  4748. info = htmlTagLookup(name);
  4749. if (info == NULL) {
  4750. htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
  4751. "Tag %s invalid\n", name, NULL);
  4752. }
  4753. /*
  4754. * Check for an Empty Element labeled the XML/SGML way
  4755. */
  4756. if ((CUR == '/') && (NXT(1) == '>')) {
  4757. SKIP(2);
  4758. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4759. ctxt->sax->endElement(ctxt->userData, name);
  4760. htmlnamePop(ctxt);
  4761. ctxt->instate = XML_PARSER_CONTENT;
  4762. #ifdef DEBUG_PUSH
  4763. xmlGenericError(xmlGenericErrorContext,
  4764. "HPP: entering CONTENT\n");
  4765. #endif
  4766. break;
  4767. }
  4768. if (CUR == '>') {
  4769. NEXT;
  4770. } else {
  4771. htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
  4772. "Couldn't find end of Start Tag %s\n",
  4773. name, NULL);
  4774. /*
  4775. * end of parsing of this node.
  4776. */
  4777. if (xmlStrEqual(name, ctxt->name)) {
  4778. nodePop(ctxt);
  4779. htmlnamePop(ctxt);
  4780. }
  4781. ctxt->instate = XML_PARSER_CONTENT;
  4782. #ifdef DEBUG_PUSH
  4783. xmlGenericError(xmlGenericErrorContext,
  4784. "HPP: entering CONTENT\n");
  4785. #endif
  4786. break;
  4787. }
  4788. /*
  4789. * Check for an Empty Element from DTD definition
  4790. */
  4791. if ((info != NULL) && (info->empty)) {
  4792. if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
  4793. ctxt->sax->endElement(ctxt->userData, name);
  4794. htmlnamePop(ctxt);
  4795. }
  4796. ctxt->instate = XML_PARSER_CONTENT;
  4797. #ifdef DEBUG_PUSH
  4798. xmlGenericError(xmlGenericErrorContext,
  4799. "HPP: entering CONTENT\n");
  4800. #endif
  4801. break;
  4802. }
  4803. case XML_PARSER_CONTENT: {
  4804. long cons;
  4805. /*
  4806. * Handle preparsed entities and charRef
  4807. */
  4808. if (ctxt->token != 0) {
  4809. xmlChar chr[2] = { 0 , 0 } ;
  4810. chr[0] = (xmlChar) ctxt->token;
  4811. htmlCheckParagraph(ctxt);
  4812. if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
  4813. ctxt->sax->characters(ctxt->userData, chr, 1);
  4814. ctxt->token = 0;
  4815. ctxt->checkIndex = 0;
  4816. }
  4817. if ((avail == 1) && (terminate)) {
  4818. cur = in->cur[0];
  4819. if ((cur != '<') && (cur != '&')) {
  4820. if (ctxt->sax != NULL) {
  4821. if (IS_BLANK_CH(cur)) {
  4822. if (ctxt->sax->ignorableWhitespace != NULL)
  4823. ctxt->sax->ignorableWhitespace(
  4824. ctxt->userData, &cur, 1);
  4825. } else {
  4826. htmlCheckParagraph(ctxt);
  4827. if (ctxt->sax->characters != NULL)
  4828. ctxt->sax->characters(
  4829. ctxt->userData, &cur, 1);
  4830. }
  4831. }
  4832. ctxt->token = 0;
  4833. ctxt->checkIndex = 0;
  4834. in->cur++;
  4835. break;
  4836. }
  4837. }
  4838. if (avail < 2)
  4839. goto done;
  4840. cur = in->cur[0];
  4841. next = in->cur[1];
  4842. cons = ctxt->nbChars;
  4843. if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
  4844. (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
  4845. /*
  4846. * Handle SCRIPT/STYLE separately
  4847. */
  4848. if (!terminate) {
  4849. int idx;
  4850. xmlChar val;
  4851. idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
  4852. if (idx < 0)
  4853. goto done;
  4854. val = in->cur[idx + 2];
  4855. if (val == 0) /* bad cut of input */
  4856. goto done;
  4857. }
  4858. htmlParseScript(ctxt);
  4859. if ((cur == '<') && (next == '/')) {
  4860. ctxt->instate = XML_PARSER_END_TAG;
  4861. ctxt->checkIndex = 0;
  4862. #ifdef DEBUG_PUSH
  4863. xmlGenericError(xmlGenericErrorContext,
  4864. "HPP: entering END_TAG\n");
  4865. #endif
  4866. break;
  4867. }
  4868. } else {
  4869. /*
  4870. * Sometimes DOCTYPE arrives in the middle of the document
  4871. */
  4872. if ((cur == '<') && (next == '!') &&
  4873. (UPP(2) == 'D') && (UPP(3) == 'O') &&
  4874. (UPP(4) == 'C') && (UPP(5) == 'T') &&
  4875. (UPP(6) == 'Y') && (UPP(7) == 'P') &&
  4876. (UPP(8) == 'E')) {
  4877. if ((!terminate) &&
  4878. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4879. goto done;
  4880. htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
  4881. "Misplaced DOCTYPE declaration\n",
  4882. BAD_CAST "DOCTYPE" , NULL);
  4883. htmlParseDocTypeDecl(ctxt);
  4884. } else if ((cur == '<') && (next == '!') &&
  4885. (in->cur[2] == '-') && (in->cur[3] == '-')) {
  4886. if ((!terminate) &&
  4887. (htmlParseLookupSequence(
  4888. ctxt, '-', '-', '>', 1, 1) < 0))
  4889. goto done;
  4890. #ifdef DEBUG_PUSH
  4891. xmlGenericError(xmlGenericErrorContext,
  4892. "HPP: Parsing Comment\n");
  4893. #endif
  4894. htmlParseComment(ctxt);
  4895. ctxt->instate = XML_PARSER_CONTENT;
  4896. } else if ((cur == '<') && (next == '?')) {
  4897. if ((!terminate) &&
  4898. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4899. goto done;
  4900. #ifdef DEBUG_PUSH
  4901. xmlGenericError(xmlGenericErrorContext,
  4902. "HPP: Parsing PI\n");
  4903. #endif
  4904. htmlParsePI(ctxt);
  4905. ctxt->instate = XML_PARSER_CONTENT;
  4906. } else if ((cur == '<') && (next == '!') && (avail < 4)) {
  4907. goto done;
  4908. } else if ((cur == '<') && (next == '/')) {
  4909. ctxt->instate = XML_PARSER_END_TAG;
  4910. ctxt->checkIndex = 0;
  4911. #ifdef DEBUG_PUSH
  4912. xmlGenericError(xmlGenericErrorContext,
  4913. "HPP: entering END_TAG\n");
  4914. #endif
  4915. break;
  4916. } else if (cur == '<') {
  4917. ctxt->instate = XML_PARSER_START_TAG;
  4918. ctxt->checkIndex = 0;
  4919. #ifdef DEBUG_PUSH
  4920. xmlGenericError(xmlGenericErrorContext,
  4921. "HPP: entering START_TAG\n");
  4922. #endif
  4923. break;
  4924. } else if (cur == '&') {
  4925. if ((!terminate) &&
  4926. (htmlParseLookupChars(ctxt,
  4927. BAD_CAST "; >/", 4) < 0))
  4928. goto done;
  4929. #ifdef DEBUG_PUSH
  4930. xmlGenericError(xmlGenericErrorContext,
  4931. "HPP: Parsing Reference\n");
  4932. #endif
  4933. /* TODO: check generation of subtrees if noent !!! */
  4934. htmlParseReference(ctxt);
  4935. } else {
  4936. /*
  4937. * check that the text sequence is complete
  4938. * before handing out the data to the parser
  4939. * to avoid problems with erroneous end of
  4940. * data detection.
  4941. */
  4942. if ((!terminate) &&
  4943. (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
  4944. goto done;
  4945. ctxt->checkIndex = 0;
  4946. #ifdef DEBUG_PUSH
  4947. xmlGenericError(xmlGenericErrorContext,
  4948. "HPP: Parsing char data\n");
  4949. #endif
  4950. htmlParseCharData(ctxt);
  4951. }
  4952. }
  4953. if (cons == ctxt->nbChars) {
  4954. if (ctxt->node != NULL) {
  4955. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4956. "detected an error in element content\n",
  4957. NULL, NULL);
  4958. }
  4959. NEXT;
  4960. break;
  4961. }
  4962. break;
  4963. }
  4964. case XML_PARSER_END_TAG:
  4965. if (avail < 2)
  4966. goto done;
  4967. if ((!terminate) &&
  4968. (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
  4969. goto done;
  4970. htmlParseEndTag(ctxt);
  4971. if (ctxt->nameNr == 0) {
  4972. ctxt->instate = XML_PARSER_EPILOG;
  4973. } else {
  4974. ctxt->instate = XML_PARSER_CONTENT;
  4975. }
  4976. ctxt->checkIndex = 0;
  4977. #ifdef DEBUG_PUSH
  4978. xmlGenericError(xmlGenericErrorContext,
  4979. "HPP: entering CONTENT\n");
  4980. #endif
  4981. break;
  4982. case XML_PARSER_CDATA_SECTION:
  4983. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4984. "HPP: internal error, state == CDATA\n",
  4985. NULL, NULL);
  4986. ctxt->instate = XML_PARSER_CONTENT;
  4987. ctxt->checkIndex = 0;
  4988. #ifdef DEBUG_PUSH
  4989. xmlGenericError(xmlGenericErrorContext,
  4990. "HPP: entering CONTENT\n");
  4991. #endif
  4992. break;
  4993. case XML_PARSER_DTD:
  4994. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  4995. "HPP: internal error, state == DTD\n",
  4996. NULL, NULL);
  4997. ctxt->instate = XML_PARSER_CONTENT;
  4998. ctxt->checkIndex = 0;
  4999. #ifdef DEBUG_PUSH
  5000. xmlGenericError(xmlGenericErrorContext,
  5001. "HPP: entering CONTENT\n");
  5002. #endif
  5003. break;
  5004. case XML_PARSER_COMMENT:
  5005. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5006. "HPP: internal error, state == COMMENT\n",
  5007. NULL, NULL);
  5008. ctxt->instate = XML_PARSER_CONTENT;
  5009. ctxt->checkIndex = 0;
  5010. #ifdef DEBUG_PUSH
  5011. xmlGenericError(xmlGenericErrorContext,
  5012. "HPP: entering CONTENT\n");
  5013. #endif
  5014. break;
  5015. case XML_PARSER_PI:
  5016. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5017. "HPP: internal error, state == PI\n",
  5018. NULL, NULL);
  5019. ctxt->instate = XML_PARSER_CONTENT;
  5020. ctxt->checkIndex = 0;
  5021. #ifdef DEBUG_PUSH
  5022. xmlGenericError(xmlGenericErrorContext,
  5023. "HPP: entering CONTENT\n");
  5024. #endif
  5025. break;
  5026. case XML_PARSER_ENTITY_DECL:
  5027. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5028. "HPP: internal error, state == ENTITY_DECL\n",
  5029. NULL, NULL);
  5030. ctxt->instate = XML_PARSER_CONTENT;
  5031. ctxt->checkIndex = 0;
  5032. #ifdef DEBUG_PUSH
  5033. xmlGenericError(xmlGenericErrorContext,
  5034. "HPP: entering CONTENT\n");
  5035. #endif
  5036. break;
  5037. case XML_PARSER_ENTITY_VALUE:
  5038. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5039. "HPP: internal error, state == ENTITY_VALUE\n",
  5040. NULL, NULL);
  5041. ctxt->instate = XML_PARSER_CONTENT;
  5042. ctxt->checkIndex = 0;
  5043. #ifdef DEBUG_PUSH
  5044. xmlGenericError(xmlGenericErrorContext,
  5045. "HPP: entering DTD\n");
  5046. #endif
  5047. break;
  5048. case XML_PARSER_ATTRIBUTE_VALUE:
  5049. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5050. "HPP: internal error, state == ATTRIBUTE_VALUE\n",
  5051. NULL, NULL);
  5052. ctxt->instate = XML_PARSER_START_TAG;
  5053. ctxt->checkIndex = 0;
  5054. #ifdef DEBUG_PUSH
  5055. xmlGenericError(xmlGenericErrorContext,
  5056. "HPP: entering START_TAG\n");
  5057. #endif
  5058. break;
  5059. case XML_PARSER_SYSTEM_LITERAL:
  5060. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5061. "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
  5062. NULL, NULL);
  5063. ctxt->instate = XML_PARSER_CONTENT;
  5064. ctxt->checkIndex = 0;
  5065. #ifdef DEBUG_PUSH
  5066. xmlGenericError(xmlGenericErrorContext,
  5067. "HPP: entering CONTENT\n");
  5068. #endif
  5069. break;
  5070. case XML_PARSER_IGNORE:
  5071. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5072. "HPP: internal error, state == XML_PARSER_IGNORE\n",
  5073. NULL, NULL);
  5074. ctxt->instate = XML_PARSER_CONTENT;
  5075. ctxt->checkIndex = 0;
  5076. #ifdef DEBUG_PUSH
  5077. xmlGenericError(xmlGenericErrorContext,
  5078. "HPP: entering CONTENT\n");
  5079. #endif
  5080. break;
  5081. case XML_PARSER_PUBLIC_LITERAL:
  5082. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5083. "HPP: internal error, state == XML_PARSER_LITERAL\n",
  5084. NULL, NULL);
  5085. ctxt->instate = XML_PARSER_CONTENT;
  5086. ctxt->checkIndex = 0;
  5087. #ifdef DEBUG_PUSH
  5088. xmlGenericError(xmlGenericErrorContext,
  5089. "HPP: entering CONTENT\n");
  5090. #endif
  5091. break;
  5092. }
  5093. }
  5094. done:
  5095. if ((avail == 0) && (terminate)) {
  5096. htmlAutoCloseOnEnd(ctxt);
  5097. if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
  5098. /*
  5099. * SAX: end of the document processing.
  5100. */
  5101. ctxt->instate = XML_PARSER_EOF;
  5102. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5103. ctxt->sax->endDocument(ctxt->userData);
  5104. }
  5105. }
  5106. if ((ctxt->myDoc != NULL) &&
  5107. ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
  5108. (ctxt->instate == XML_PARSER_EPILOG))) {
  5109. xmlDtdPtr dtd;
  5110. dtd = xmlGetIntSubset(ctxt->myDoc);
  5111. if (dtd == NULL)
  5112. ctxt->myDoc->intSubset =
  5113. xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
  5114. BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
  5115. BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
  5116. }
  5117. #ifdef DEBUG_PUSH
  5118. xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
  5119. #endif
  5120. return(ret);
  5121. }
  5122. /**
  5123. * htmlParseChunk:
  5124. * @ctxt: an HTML parser context
  5125. * @chunk: an char array
  5126. * @size: the size in byte of the chunk
  5127. * @terminate: last chunk indicator
  5128. *
  5129. * Parse a Chunk of memory
  5130. *
  5131. * Returns zero if no error, the xmlParserErrors otherwise.
  5132. */
  5133. int
  5134. htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
  5135. int terminate) {
  5136. if ((ctxt == NULL) || (ctxt->input == NULL)) {
  5137. htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
  5138. "htmlParseChunk: context error\n", NULL, NULL);
  5139. return(XML_ERR_INTERNAL_ERROR);
  5140. }
  5141. if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
  5142. (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
  5143. int base = ctxt->input->base - ctxt->input->buf->buffer->content;
  5144. int cur = ctxt->input->cur - ctxt->input->base;
  5145. int res;
  5146. res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
  5147. if (res < 0) {
  5148. ctxt->errNo = XML_PARSER_EOF;
  5149. ctxt->disableSAX = 1;
  5150. return (XML_PARSER_EOF);
  5151. }
  5152. ctxt->input->base = ctxt->input->buf->buffer->content + base;
  5153. ctxt->input->cur = ctxt->input->base + cur;
  5154. ctxt->input->end =
  5155. &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
  5156. #ifdef DEBUG_PUSH
  5157. xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
  5158. #endif
  5159. #if 0
  5160. if ((terminate) || (ctxt->input->buf->buffer->use > 80))
  5161. htmlParseTryOrFinish(ctxt, terminate);
  5162. #endif
  5163. } else if (ctxt->instate != XML_PARSER_EOF) {
  5164. if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
  5165. xmlParserInputBufferPtr in = ctxt->input->buf;
  5166. if ((in->encoder != NULL) && (in->buffer != NULL) &&
  5167. (in->raw != NULL)) {
  5168. int nbchars;
  5169. nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
  5170. if (nbchars < 0) {
  5171. htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
  5172. "encoder error\n", NULL, NULL);
  5173. return(XML_ERR_INVALID_ENCODING);
  5174. }
  5175. }
  5176. }
  5177. }
  5178. htmlParseTryOrFinish(ctxt, terminate);
  5179. if (terminate) {
  5180. if ((ctxt->instate != XML_PARSER_EOF) &&
  5181. (ctxt->instate != XML_PARSER_EPILOG) &&
  5182. (ctxt->instate != XML_PARSER_MISC)) {
  5183. ctxt->errNo = XML_ERR_DOCUMENT_END;
  5184. ctxt->wellFormed = 0;
  5185. }
  5186. if (ctxt->instate != XML_PARSER_EOF) {
  5187. if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
  5188. ctxt->sax->endDocument(ctxt->userData);
  5189. }
  5190. ctxt->instate = XML_PARSER_EOF;
  5191. }
  5192. return((xmlParserErrors) ctxt->errNo);
  5193. }
  5194. /************************************************************************
  5195. * *
  5196. * User entry points *
  5197. * *
  5198. ************************************************************************/
  5199. /**
  5200. * htmlCreatePushParserCtxt:
  5201. * @sax: a SAX handler
  5202. * @user_data: The user data returned on SAX callbacks
  5203. * @chunk: a pointer to an array of chars
  5204. * @size: number of chars in the array
  5205. * @filename: an optional file name or URI
  5206. * @enc: an optional encoding
  5207. *
  5208. * Create a parser context for using the HTML parser in push mode
  5209. * The value of @filename is used for fetching external entities
  5210. * and error/warning reports.
  5211. *
  5212. * Returns the new parser context or NULL
  5213. */
  5214. htmlParserCtxtPtr
  5215. htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
  5216. const char *chunk, int size, const char *filename,
  5217. xmlCharEncoding enc) {
  5218. htmlParserCtxtPtr ctxt;
  5219. htmlParserInputPtr inputStream;
  5220. xmlParserInputBufferPtr buf;
  5221. xmlInitParser();
  5222. buf = xmlAllocParserInputBuffer(enc);
  5223. if (buf == NULL) return(NULL);
  5224. ctxt = htmlNewParserCtxt();
  5225. if (ctxt == NULL) {
  5226. xmlFreeParserInputBuffer(buf);
  5227. return(NULL);
  5228. }
  5229. if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
  5230. ctxt->charset=XML_CHAR_ENCODING_UTF8;
  5231. if (sax != NULL) {
  5232. if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
  5233. xmlFree(ctxt->sax);
  5234. ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
  5235. if (ctxt->sax == NULL) {
  5236. xmlFree(buf);
  5237. xmlFree(ctxt);
  5238. return(NULL);
  5239. }
  5240. memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
  5241. if (user_data != NULL)
  5242. ctxt->userData = user_data;
  5243. }
  5244. if (filename == NULL) {
  5245. ctxt->directory = NULL;
  5246. } else {
  5247. ctxt->directory = xmlParserGetDirectory(filename);
  5248. }
  5249. inputStream = htmlNewInputStream(ctxt);
  5250. if (inputStream == NULL) {
  5251. xmlFreeParserCtxt(ctxt);
  5252. xmlFree(buf);
  5253. return(NULL);
  5254. }
  5255. if (filename == NULL)
  5256. inputStream->filename = NULL;
  5257. else
  5258. inputStream->filename = (char *)
  5259. xmlCanonicPath((const xmlChar *) filename);
  5260. inputStream->buf = buf;
  5261. inputStream->base = inputStream->buf->buffer->content;
  5262. inputStream->cur = inputStream->buf->buffer->content;
  5263. inputStream->end =
  5264. &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
  5265. inputPush(ctxt, inputStream);
  5266. if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
  5267. (ctxt->input->buf != NULL)) {
  5268. int base = ctxt->input->base - ctxt->input->buf->buffer->content;
  5269. int cur = ctxt->input->cur - ctxt->input->base;
  5270. xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
  5271. ctxt->input->base = ctxt->input->buf->buffer->content + base;
  5272. ctxt->input->cur = ctxt->input->base + cur;
  5273. ctxt->input->end =
  5274. &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
  5275. #ifdef DEBUG_PUSH
  5276. xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
  5277. #endif
  5278. }
  5279. ctxt->progressive = 1;
  5280. return(ctxt);
  5281. }
  5282. #endif /* LIBXML_PUSH_ENABLED */
  5283. /**
  5284. * htmlSAXParseDoc:
  5285. * @cur: a pointer to an array of xmlChar
  5286. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5287. * @sax: the SAX handler block
  5288. * @userData: if using SAX, this pointer will be provided on callbacks.
  5289. *
  5290. * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
  5291. * to handle parse events. If sax is NULL, fallback to the default DOM
  5292. * behavior and return a tree.
  5293. *
  5294. * Returns the resulting document tree unless SAX is NULL or the document is
  5295. * not well formed.
  5296. */
  5297. htmlDocPtr
  5298. htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
  5299. htmlDocPtr ret;
  5300. htmlParserCtxtPtr ctxt;
  5301. xmlInitParser();
  5302. if (cur == NULL) return(NULL);
  5303. ctxt = htmlCreateDocParserCtxt(cur, encoding);
  5304. if (ctxt == NULL) return(NULL);
  5305. if (sax != NULL) {
  5306. if (ctxt->sax != NULL) xmlFree (ctxt->sax);
  5307. ctxt->sax = sax;
  5308. ctxt->userData = userData;
  5309. }
  5310. htmlParseDocument(ctxt);
  5311. ret = ctxt->myDoc;
  5312. if (sax != NULL) {
  5313. ctxt->sax = NULL;
  5314. ctxt->userData = NULL;
  5315. }
  5316. htmlFreeParserCtxt(ctxt);
  5317. return(ret);
  5318. }
  5319. /**
  5320. * htmlParseDoc:
  5321. * @cur: a pointer to an array of xmlChar
  5322. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5323. *
  5324. * parse an HTML in-memory document and build a tree.
  5325. *
  5326. * Returns the resulting document tree
  5327. */
  5328. htmlDocPtr
  5329. htmlParseDoc(xmlChar *cur, const char *encoding) {
  5330. return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
  5331. }
  5332. /**
  5333. * htmlCreateFileParserCtxt:
  5334. * @filename: the filename
  5335. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5336. *
  5337. * Create a parser context for a file content.
  5338. * Automatic support for ZLIB/Compress compressed document is provided
  5339. * by default if found at compile-time.
  5340. *
  5341. * Returns the new parser context or NULL
  5342. */
  5343. htmlParserCtxtPtr
  5344. htmlCreateFileParserCtxt(const char *filename, const char *encoding)
  5345. {
  5346. htmlParserCtxtPtr ctxt;
  5347. htmlParserInputPtr inputStream;
  5348. char *canonicFilename;
  5349. /* htmlCharEncoding enc; */
  5350. xmlChar *content, *content_line = (xmlChar *) "charset=";
  5351. if (filename == NULL)
  5352. return(NULL);
  5353. ctxt = htmlNewParserCtxt();
  5354. if (ctxt == NULL) {
  5355. return(NULL);
  5356. }
  5357. canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
  5358. if (canonicFilename == NULL) {
  5359. #ifdef LIBXML_SAX1_ENABLED
  5360. if (xmlDefaultSAXHandler.error != NULL) {
  5361. xmlDefaultSAXHandler.error(NULL, "out of memory\n");
  5362. }
  5363. #endif
  5364. xmlFreeParserCtxt(ctxt);
  5365. return(NULL);
  5366. }
  5367. inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
  5368. xmlFree(canonicFilename);
  5369. if (inputStream == NULL) {
  5370. xmlFreeParserCtxt(ctxt);
  5371. return(NULL);
  5372. }
  5373. inputPush(ctxt, inputStream);
  5374. /* set encoding */
  5375. if (encoding) {
  5376. content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
  5377. if (content) {
  5378. strcpy ((char *)content, (char *)content_line);
  5379. strcat ((char *)content, (char *)encoding);
  5380. htmlCheckEncoding (ctxt, content);
  5381. xmlFree (content);
  5382. }
  5383. }
  5384. return(ctxt);
  5385. }
  5386. /**
  5387. * htmlSAXParseFile:
  5388. * @filename: the filename
  5389. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5390. * @sax: the SAX handler block
  5391. * @userData: if using SAX, this pointer will be provided on callbacks.
  5392. *
  5393. * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
  5394. * compressed document is provided by default if found at compile-time.
  5395. * It use the given SAX function block to handle the parsing callback.
  5396. * If sax is NULL, fallback to the default DOM tree building routines.
  5397. *
  5398. * Returns the resulting document tree unless SAX is NULL or the document is
  5399. * not well formed.
  5400. */
  5401. htmlDocPtr
  5402. htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
  5403. void *userData) {
  5404. htmlDocPtr ret;
  5405. htmlParserCtxtPtr ctxt;
  5406. htmlSAXHandlerPtr oldsax = NULL;
  5407. xmlInitParser();
  5408. ctxt = htmlCreateFileParserCtxt(filename, encoding);
  5409. if (ctxt == NULL) return(NULL);
  5410. if (sax != NULL) {
  5411. oldsax = ctxt->sax;
  5412. ctxt->sax = sax;
  5413. ctxt->userData = userData;
  5414. }
  5415. htmlParseDocument(ctxt);
  5416. ret = ctxt->myDoc;
  5417. if (sax != NULL) {
  5418. ctxt->sax = oldsax;
  5419. ctxt->userData = NULL;
  5420. }
  5421. htmlFreeParserCtxt(ctxt);
  5422. return(ret);
  5423. }
  5424. /**
  5425. * htmlParseFile:
  5426. * @filename: the filename
  5427. * @encoding: a free form C string describing the HTML document encoding, or NULL
  5428. *
  5429. * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
  5430. * compressed document is provided by default if found at compile-time.
  5431. *
  5432. * Returns the resulting document tree
  5433. */
  5434. htmlDocPtr
  5435. htmlParseFile(const char *filename, const char *encoding) {
  5436. return(htmlSAXParseFile(filename, encoding, NULL, NULL));
  5437. }
  5438. /**
  5439. * htmlHandleOmittedElem:
  5440. * @val: int 0 or 1
  5441. *
  5442. * Set and return the previous value for handling HTML omitted tags.
  5443. *
  5444. * Returns the last value for 0 for no handling, 1 for auto insertion.
  5445. */
  5446. int
  5447. htmlHandleOmittedElem(int val) {
  5448. int old = htmlOmittedDefaultValue;
  5449. htmlOmittedDefaultValue = val;
  5450. return(old);
  5451. }
  5452. /**
  5453. * htmlElementAllowedHere:
  5454. * @parent: HTML parent element
  5455. * @elt: HTML element
  5456. *
  5457. * Checks whether an HTML element may be a direct child of a parent element.
  5458. * Note - doesn't check for deprecated elements
  5459. *
  5460. * Returns 1 if allowed; 0 otherwise.
  5461. */
  5462. int
  5463. htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
  5464. const char** p ;
  5465. if ( ! elt || ! parent || ! parent->subelts )
  5466. return 0 ;
  5467. for ( p = parent->subelts; *p; ++p )
  5468. if ( !xmlStrcmp((const xmlChar *)*p, elt) )
  5469. return 1 ;
  5470. return 0 ;
  5471. }
  5472. /**
  5473. * htmlElementStatusHere:
  5474. * @parent: HTML parent element
  5475. * @elt: HTML element
  5476. *
  5477. * Checks whether an HTML element may be a direct child of a parent element.
  5478. * and if so whether it is valid or deprecated.
  5479. *
  5480. * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
  5481. */
  5482. htmlStatus
  5483. htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
  5484. if ( ! parent || ! elt )
  5485. return HTML_INVALID ;
  5486. if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
  5487. return HTML_INVALID ;
  5488. return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
  5489. }
  5490. /**
  5491. * htmlAttrAllowed:
  5492. * @elt: HTML element
  5493. * @attr: HTML attribute
  5494. * @legacy: whether to allow deprecated attributes
  5495. *
  5496. * Checks whether an attribute is valid for an element
  5497. * Has full knowledge of Required and Deprecated attributes
  5498. *
  5499. * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
  5500. */
  5501. htmlStatus
  5502. htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
  5503. const char** p ;
  5504. if ( !elt || ! attr )
  5505. return HTML_INVALID ;
  5506. if ( elt->attrs_req )
  5507. for ( p = elt->attrs_req; *p; ++p)
  5508. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  5509. return HTML_REQUIRED ;
  5510. if ( elt->attrs_opt )
  5511. for ( p = elt->attrs_opt; *p; ++p)
  5512. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  5513. return HTML_VALID ;
  5514. if ( legacy && elt->attrs_depr )
  5515. for ( p = elt->attrs_depr; *p; ++p)
  5516. if ( !xmlStrcmp((const xmlChar*)*p, attr) )
  5517. return HTML_DEPRECATED ;
  5518. return HTML_INVALID ;
  5519. }
  5520. /**
  5521. * htmlNodeStatus:
  5522. * @node: an htmlNodePtr in a tree
  5523. * @legacy: whether to allow deprecated elements (YES is faster here
  5524. * for Element nodes)
  5525. *
  5526. * Checks whether the tree node is valid. Experimental (the author
  5527. * only uses the HTML enhancements in a SAX parser)
  5528. *
  5529. * Return: for Element nodes, a return from htmlElementAllowedHere (if
  5530. * legacy allowed) or htmlElementStatusHere (otherwise).
  5531. * for Attribute nodes, a return from htmlAttrAllowed
  5532. * for other nodes, HTML_NA (no checks performed)
  5533. */
  5534. htmlStatus
  5535. htmlNodeStatus(const htmlNodePtr node, int legacy) {
  5536. if ( ! node )
  5537. return HTML_INVALID ;
  5538. switch ( node->type ) {
  5539. case XML_ELEMENT_NODE:
  5540. return legacy
  5541. ? ( htmlElementAllowedHere (
  5542. htmlTagLookup(node->parent->name) , node->name
  5543. ) ? HTML_VALID : HTML_INVALID )
  5544. : htmlElementStatusHere(
  5545. htmlTagLookup(node->parent->name) ,
  5546. htmlTagLookup(node->name) )
  5547. ;
  5548. case XML_ATTRIBUTE_NODE:
  5549. return htmlAttrAllowed(
  5550. htmlTagLookup(node->parent->name) , node->name, legacy) ;
  5551. default: return HTML_NA ;
  5552. }
  5553. }
  5554. /************************************************************************
  5555. * *
  5556. * New set (2.6.0) of simpler and more flexible APIs *
  5557. * *
  5558. ************************************************************************/
  5559. /**
  5560. * DICT_FREE:
  5561. * @str: a string
  5562. *
  5563. * Free a string if it is not owned by the "dict" dictionnary in the
  5564. * current scope
  5565. */
  5566. #define DICT_FREE(str) \
  5567. if ((str) && ((!dict) || \
  5568. (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
  5569. xmlFree((char *)(str));
  5570. /**
  5571. * htmlCtxtReset:
  5572. * @ctxt: an HTML parser context
  5573. *
  5574. * Reset a parser context
  5575. */
  5576. void
  5577. htmlCtxtReset(htmlParserCtxtPtr ctxt)
  5578. {
  5579. xmlParserInputPtr input;
  5580. xmlDictPtr dict;
  5581. if (ctxt == NULL)
  5582. return;
  5583. xmlInitParser();
  5584. dict = ctxt->dict;
  5585. while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
  5586. xmlFreeInputStream(input);
  5587. }
  5588. ctxt->inputNr = 0;
  5589. ctxt->input = NULL;
  5590. ctxt->spaceNr = 0;
  5591. if (ctxt->spaceTab != NULL) {
  5592. ctxt->spaceTab[0] = -1;
  5593. ctxt->space = &ctxt->spaceTab[0];
  5594. } else {
  5595. ctxt->space = NULL;
  5596. }
  5597. ctxt->nodeNr = 0;
  5598. ctxt->node = NULL;
  5599. ctxt->nameNr = 0;
  5600. ctxt->name = NULL;
  5601. DICT_FREE(ctxt->version);
  5602. ctxt->version = NULL;
  5603. DICT_FREE(ctxt->encoding);
  5604. ctxt->encoding = NULL;
  5605. DICT_FREE(ctxt->directory);
  5606. ctxt->directory = NULL;
  5607. DICT_FREE(ctxt->extSubURI);
  5608. ctxt->extSubURI = NULL;
  5609. DICT_FREE(ctxt->extSubSystem);
  5610. ctxt->extSubSystem = NULL;
  5611. if (ctxt->myDoc != NULL)
  5612. xmlFreeDoc(ctxt->myDoc);
  5613. ctxt->myDoc = NULL;
  5614. ctxt->standalone = -1;
  5615. ctxt->hasExternalSubset = 0;
  5616. ctxt->hasPErefs = 0;
  5617. ctxt->html = 1;
  5618. ctxt->external = 0;
  5619. ctxt->instate = XML_PARSER_START;
  5620. ctxt->token = 0;
  5621. ctxt->wellFormed = 1;
  5622. ctxt->nsWellFormed = 1;
  5623. ctxt->valid = 1;
  5624. ctxt->vctxt.userData = ctxt;
  5625. ctxt->vctxt.error = xmlParserValidityError;
  5626. ctxt->vctxt.warning = xmlParserValidityWarning;
  5627. ctxt->record_info = 0;
  5628. ctxt->nbChars = 0;
  5629. ctxt->checkIndex = 0;
  5630. ctxt->inSubset = 0;
  5631. ctxt->errNo = XML_ERR_OK;
  5632. ctxt->depth = 0;
  5633. ctxt->charset = XML_CHAR_ENCODING_NONE;
  5634. ctxt->catalogs = NULL;
  5635. xmlInitNodeInfoSeq(&ctxt->node_seq);
  5636. if (ctxt->attsDefault != NULL) {
  5637. xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
  5638. ctxt->attsDefault = NULL;
  5639. }
  5640. if (ctxt->attsSpecial != NULL) {
  5641. xmlHashFree(ctxt->attsSpecial, NULL);
  5642. ctxt->attsSpecial = NULL;
  5643. }
  5644. }
  5645. /**
  5646. * htmlCtxtUseOptions:
  5647. * @ctxt: an HTML parser context
  5648. * @options: a combination of htmlParserOption(s)
  5649. *
  5650. * Applies the options to the parser context
  5651. *
  5652. * Returns 0 in case of success, the set of unknown or unimplemented options
  5653. * in case of error.
  5654. */
  5655. int
  5656. htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
  5657. {
  5658. if (ctxt == NULL)
  5659. return(-1);
  5660. if (options & HTML_PARSE_NOWARNING) {
  5661. ctxt->sax->warning = NULL;
  5662. ctxt->vctxt.warning = NULL;
  5663. options -= XML_PARSE_NOWARNING;
  5664. ctxt->options |= XML_PARSE_NOWARNING;
  5665. }
  5666. if (options & HTML_PARSE_NOERROR) {
  5667. ctxt->sax->error = NULL;
  5668. ctxt->vctxt.error = NULL;
  5669. ctxt->sax->fatalError = NULL;
  5670. options -= XML_PARSE_NOERROR;
  5671. ctxt->options |= XML_PARSE_NOERROR;
  5672. }
  5673. if (options & HTML_PARSE_PEDANTIC) {
  5674. ctxt->pedantic = 1;
  5675. options -= XML_PARSE_PEDANTIC;
  5676. ctxt->options |= XML_PARSE_PEDANTIC;
  5677. } else
  5678. ctxt->pedantic = 0;
  5679. if (options & XML_PARSE_NOBLANKS) {
  5680. ctxt->keepBlanks = 0;
  5681. ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
  5682. options -= XML_PARSE_NOBLANKS;
  5683. ctxt->options |= XML_PARSE_NOBLANKS;
  5684. } else
  5685. ctxt->keepBlanks = 1;
  5686. if (options & HTML_PARSE_RECOVER) {
  5687. ctxt->recovery = 1;
  5688. options -= HTML_PARSE_RECOVER;
  5689. } else
  5690. ctxt->recovery = 0;
  5691. if (options & HTML_PARSE_COMPACT) {
  5692. ctxt->options |= HTML_PARSE_COMPACT;
  5693. options -= HTML_PARSE_COMPACT;
  5694. }
  5695. if (options & XML_PARSE_HUGE) {
  5696. ctxt->options |= XML_PARSE_HUGE;
  5697. options -= XML_PARSE_HUGE;
  5698. }
  5699. ctxt->dictNames = 0;
  5700. return (options);
  5701. }
  5702. /**
  5703. * htmlDoRead:
  5704. * @ctxt: an HTML parser context
  5705. * @URL: the base URL to use for the document
  5706. * @encoding: the document encoding, or NULL
  5707. * @options: a combination of htmlParserOption(s)
  5708. * @reuse: keep the context for reuse
  5709. *
  5710. * Common front-end for the htmlRead functions
  5711. *
  5712. * Returns the resulting document tree or NULL
  5713. */
  5714. static htmlDocPtr
  5715. htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
  5716. int options, int reuse)
  5717. {
  5718. htmlDocPtr ret;
  5719. htmlCtxtUseOptions(ctxt, options);
  5720. ctxt->html = 1;
  5721. if (encoding != NULL) {
  5722. xmlCharEncodingHandlerPtr hdlr;
  5723. hdlr = xmlFindCharEncodingHandler(encoding);
  5724. if (hdlr != NULL) {
  5725. xmlSwitchToEncoding(ctxt, hdlr);
  5726. if (ctxt->input->encoding != NULL)
  5727. xmlFree((xmlChar *) ctxt->input->encoding);
  5728. ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
  5729. }
  5730. }
  5731. if ((URL != NULL) && (ctxt->input != NULL) &&
  5732. (ctxt->input->filename == NULL))
  5733. ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
  5734. htmlParseDocument(ctxt);
  5735. ret = ctxt->myDoc;
  5736. ctxt->myDoc = NULL;
  5737. if (!reuse) {
  5738. if ((ctxt->dictNames) &&
  5739. (ret != NULL) &&
  5740. (ret->dict == ctxt->dict))
  5741. ctxt->dict = NULL;
  5742. xmlFreeParserCtxt(ctxt);
  5743. }
  5744. return (ret);
  5745. }
  5746. /**
  5747. * htmlReadDoc:
  5748. * @cur: a pointer to a zero terminated string
  5749. * @URL: the base URL to use for the document
  5750. * @encoding: the document encoding, or NULL
  5751. * @options: a combination of htmlParserOption(s)
  5752. *
  5753. * parse an XML in-memory document and build a tree.
  5754. *
  5755. * Returns the resulting document tree
  5756. */
  5757. htmlDocPtr
  5758. htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
  5759. {
  5760. htmlParserCtxtPtr ctxt;
  5761. if (cur == NULL)
  5762. return (NULL);
  5763. xmlInitParser();
  5764. ctxt = htmlCreateDocParserCtxt(cur, NULL);
  5765. if (ctxt == NULL)
  5766. return (NULL);
  5767. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  5768. }
  5769. /**
  5770. * htmlReadFile:
  5771. * @filename: a file or URL
  5772. * @encoding: the document encoding, or NULL
  5773. * @options: a combination of htmlParserOption(s)
  5774. *
  5775. * parse an XML file from the filesystem or the network.
  5776. *
  5777. * Returns the resulting document tree
  5778. */
  5779. htmlDocPtr
  5780. htmlReadFile(const char *filename, const char *encoding, int options)
  5781. {
  5782. htmlParserCtxtPtr ctxt;
  5783. xmlInitParser();
  5784. ctxt = htmlCreateFileParserCtxt(filename, encoding);
  5785. if (ctxt == NULL)
  5786. return (NULL);
  5787. return (htmlDoRead(ctxt, NULL, NULL, options, 0));
  5788. }
  5789. /**
  5790. * htmlReadMemory:
  5791. * @buffer: a pointer to a char array
  5792. * @size: the size of the array
  5793. * @URL: the base URL to use for the document
  5794. * @encoding: the document encoding, or NULL
  5795. * @options: a combination of htmlParserOption(s)
  5796. *
  5797. * parse an XML in-memory document and build a tree.
  5798. *
  5799. * Returns the resulting document tree
  5800. */
  5801. htmlDocPtr
  5802. htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
  5803. {
  5804. htmlParserCtxtPtr ctxt;
  5805. xmlInitParser();
  5806. ctxt = xmlCreateMemoryParserCtxt(buffer, size);
  5807. if (ctxt == NULL)
  5808. return (NULL);
  5809. htmlDefaultSAXHandlerInit();
  5810. if (ctxt->sax != NULL)
  5811. memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
  5812. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  5813. }
  5814. /**
  5815. * htmlReadFd:
  5816. * @fd: an open file descriptor
  5817. * @URL: the base URL to use for the document
  5818. * @encoding: the document encoding, or NULL
  5819. * @options: a combination of htmlParserOption(s)
  5820. *
  5821. * parse an XML from a file descriptor and build a tree.
  5822. *
  5823. * Returns the resulting document tree
  5824. */
  5825. htmlDocPtr
  5826. htmlReadFd(int fd, const char *URL, const char *encoding, int options)
  5827. {
  5828. htmlParserCtxtPtr ctxt;
  5829. xmlParserInputBufferPtr input;
  5830. xmlParserInputPtr stream;
  5831. if (fd < 0)
  5832. return (NULL);
  5833. xmlInitParser();
  5834. input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
  5835. if (input == NULL)
  5836. return (NULL);
  5837. ctxt = xmlNewParserCtxt();
  5838. if (ctxt == NULL) {
  5839. xmlFreeParserInputBuffer(input);
  5840. return (NULL);
  5841. }
  5842. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  5843. if (stream == NULL) {
  5844. xmlFreeParserInputBuffer(input);
  5845. xmlFreeParserCtxt(ctxt);
  5846. return (NULL);
  5847. }
  5848. inputPush(ctxt, stream);
  5849. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  5850. }
  5851. /**
  5852. * htmlReadIO:
  5853. * @ioread: an I/O read function
  5854. * @ioclose: an I/O close function
  5855. * @ioctx: an I/O handler
  5856. * @URL: the base URL to use for the document
  5857. * @encoding: the document encoding, or NULL
  5858. * @options: a combination of htmlParserOption(s)
  5859. *
  5860. * parse an HTML document from I/O functions and source and build a tree.
  5861. *
  5862. * Returns the resulting document tree
  5863. */
  5864. htmlDocPtr
  5865. htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
  5866. void *ioctx, const char *URL, const char *encoding, int options)
  5867. {
  5868. htmlParserCtxtPtr ctxt;
  5869. xmlParserInputBufferPtr input;
  5870. xmlParserInputPtr stream;
  5871. if (ioread == NULL)
  5872. return (NULL);
  5873. xmlInitParser();
  5874. input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
  5875. XML_CHAR_ENCODING_NONE);
  5876. if (input == NULL)
  5877. return (NULL);
  5878. ctxt = htmlNewParserCtxt();
  5879. if (ctxt == NULL) {
  5880. xmlFreeParserInputBuffer(input);
  5881. return (NULL);
  5882. }
  5883. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  5884. if (stream == NULL) {
  5885. xmlFreeParserInputBuffer(input);
  5886. xmlFreeParserCtxt(ctxt);
  5887. return (NULL);
  5888. }
  5889. inputPush(ctxt, stream);
  5890. return (htmlDoRead(ctxt, URL, encoding, options, 0));
  5891. }
  5892. /**
  5893. * htmlCtxtReadDoc:
  5894. * @ctxt: an HTML parser context
  5895. * @cur: a pointer to a zero terminated string
  5896. * @URL: the base URL to use for the document
  5897. * @encoding: the document encoding, or NULL
  5898. * @options: a combination of htmlParserOption(s)
  5899. *
  5900. * parse an XML in-memory document and build a tree.
  5901. * This reuses the existing @ctxt parser context
  5902. *
  5903. * Returns the resulting document tree
  5904. */
  5905. htmlDocPtr
  5906. htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
  5907. const char *URL, const char *encoding, int options)
  5908. {
  5909. xmlParserInputPtr stream;
  5910. if (cur == NULL)
  5911. return (NULL);
  5912. if (ctxt == NULL)
  5913. return (NULL);
  5914. htmlCtxtReset(ctxt);
  5915. stream = xmlNewStringInputStream(ctxt, cur);
  5916. if (stream == NULL) {
  5917. return (NULL);
  5918. }
  5919. inputPush(ctxt, stream);
  5920. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  5921. }
  5922. /**
  5923. * htmlCtxtReadFile:
  5924. * @ctxt: an HTML parser context
  5925. * @filename: a file or URL
  5926. * @encoding: the document encoding, or NULL
  5927. * @options: a combination of htmlParserOption(s)
  5928. *
  5929. * parse an XML file from the filesystem or the network.
  5930. * This reuses the existing @ctxt parser context
  5931. *
  5932. * Returns the resulting document tree
  5933. */
  5934. htmlDocPtr
  5935. htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
  5936. const char *encoding, int options)
  5937. {
  5938. xmlParserInputPtr stream;
  5939. if (filename == NULL)
  5940. return (NULL);
  5941. if (ctxt == NULL)
  5942. return (NULL);
  5943. htmlCtxtReset(ctxt);
  5944. stream = xmlLoadExternalEntity(filename, NULL, ctxt);
  5945. if (stream == NULL) {
  5946. return (NULL);
  5947. }
  5948. inputPush(ctxt, stream);
  5949. return (htmlDoRead(ctxt, NULL, encoding, options, 1));
  5950. }
  5951. /**
  5952. * htmlCtxtReadMemory:
  5953. * @ctxt: an HTML parser context
  5954. * @buffer: a pointer to a char array
  5955. * @size: the size of the array
  5956. * @URL: the base URL to use for the document
  5957. * @encoding: the document encoding, or NULL
  5958. * @options: a combination of htmlParserOption(s)
  5959. *
  5960. * parse an XML in-memory document and build a tree.
  5961. * This reuses the existing @ctxt parser context
  5962. *
  5963. * Returns the resulting document tree
  5964. */
  5965. htmlDocPtr
  5966. htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
  5967. const char *URL, const char *encoding, int options)
  5968. {
  5969. xmlParserInputBufferPtr input;
  5970. xmlParserInputPtr stream;
  5971. if (ctxt == NULL)
  5972. return (NULL);
  5973. if (buffer == NULL)
  5974. return (NULL);
  5975. htmlCtxtReset(ctxt);
  5976. input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
  5977. if (input == NULL) {
  5978. return(NULL);
  5979. }
  5980. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  5981. if (stream == NULL) {
  5982. xmlFreeParserInputBuffer(input);
  5983. return(NULL);
  5984. }
  5985. inputPush(ctxt, stream);
  5986. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  5987. }
  5988. /**
  5989. * htmlCtxtReadFd:
  5990. * @ctxt: an HTML parser context
  5991. * @fd: an open file descriptor
  5992. * @URL: the base URL to use for the document
  5993. * @encoding: the document encoding, or NULL
  5994. * @options: a combination of htmlParserOption(s)
  5995. *
  5996. * parse an XML from a file descriptor and build a tree.
  5997. * This reuses the existing @ctxt parser context
  5998. *
  5999. * Returns the resulting document tree
  6000. */
  6001. htmlDocPtr
  6002. htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
  6003. const char *URL, const char *encoding, int options)
  6004. {
  6005. xmlParserInputBufferPtr input;
  6006. xmlParserInputPtr stream;
  6007. if (fd < 0)
  6008. return (NULL);
  6009. if (ctxt == NULL)
  6010. return (NULL);
  6011. htmlCtxtReset(ctxt);
  6012. input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
  6013. if (input == NULL)
  6014. return (NULL);
  6015. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6016. if (stream == NULL) {
  6017. xmlFreeParserInputBuffer(input);
  6018. return (NULL);
  6019. }
  6020. inputPush(ctxt, stream);
  6021. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6022. }
  6023. /**
  6024. * htmlCtxtReadIO:
  6025. * @ctxt: an HTML parser context
  6026. * @ioread: an I/O read function
  6027. * @ioclose: an I/O close function
  6028. * @ioctx: an I/O handler
  6029. * @URL: the base URL to use for the document
  6030. * @encoding: the document encoding, or NULL
  6031. * @options: a combination of htmlParserOption(s)
  6032. *
  6033. * parse an HTML document from I/O functions and source and build a tree.
  6034. * This reuses the existing @ctxt parser context
  6035. *
  6036. * Returns the resulting document tree
  6037. */
  6038. htmlDocPtr
  6039. htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
  6040. xmlInputCloseCallback ioclose, void *ioctx,
  6041. const char *URL,
  6042. const char *encoding, int options)
  6043. {
  6044. xmlParserInputBufferPtr input;
  6045. xmlParserInputPtr stream;
  6046. if (ioread == NULL)
  6047. return (NULL);
  6048. if (ctxt == NULL)
  6049. return (NULL);
  6050. htmlCtxtReset(ctxt);
  6051. input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
  6052. XML_CHAR_ENCODING_NONE);
  6053. if (input == NULL)
  6054. return (NULL);
  6055. stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  6056. if (stream == NULL) {
  6057. xmlFreeParserInputBuffer(input);
  6058. return (NULL);
  6059. }
  6060. inputPush(ctxt, stream);
  6061. return (htmlDoRead(ctxt, URL, encoding, options, 1));
  6062. }
  6063. #define bottom_HTMLparser
  6064. #include "elfgcchack.h"
  6065. #endif /* LIBXML_HTML_ENABLED */