12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880 |
- #include "libxml.h"
- #ifdef LIBXML_HTML_ENABLED
- #include <string.h>
- #include <stdarg.h>
- #ifdef HAVE_SYS_TYPES_H
- #include <sys/types.h>
- #endif
- #ifdef HAVE_SYS_STAT_H
- #include <sys/stat.h>
- #endif
- #ifdef HAVE_FCNTL_H
- #include <fcntl.h>
- #endif
- #ifdef HAVE_UNISTD_H
- #include <unistd.h>
- #endif
- #ifdef HAVE_STDLIB_H
- #include <stdlib.h>
- #endif
- #include <libxml/xmlmemory.h>
- #include <libxml/HTMLparser.h>
- #include <libxml/HTMLtree.h>
- #include <libxml/debugXML.h>
- #include <libxml/xmlerror.h>
- #include <libxml/globals.h>
/*
 * Command-line state. Every counter below is bumped by main() each time
 * the matching switch is seen on the command line.
 */
#ifdef LIBXML_DEBUG_ENABLED
static int debug = 0;           /* -debug / --debug */
#endif
static int copy = 0;            /* -copy / --copy */
static int sax = 0;             /* -sax / --sax: use parseSAXFile() */
static int repeat = 0;          /* -repeat / --repeat */
static int noout = 0;           /* -noout / --noout */
#ifdef LIBXML_PUSH_ENABLED
static int push = 0;            /* -push / --push */
#endif
static char *encoding = NULL;   /* argument following -encode / --encode */
static int options = 0;         /* option mask handed to htmlReadFile() */
- static xmlSAXHandler emptySAXHandlerStruct = {
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- 1,
- NULL,
- NULL,
- NULL,
- NULL
- };
- static xmlSAXHandlerPtr emptySAXHandler = &emptySAXHandlerStruct;
- extern xmlSAXHandlerPtr debugSAXHandler;
- static int
- isStandaloneDebug(void *ctx ATTRIBUTE_UNUSED)
- {
- fprintf(stdout, "SAX.isStandalone()\n");
- return(0);
- }
- static int
- hasInternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED)
- {
- fprintf(stdout, "SAX.hasInternalSubset()\n");
- return(0);
- }
- static int
- hasExternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED)
- {
- fprintf(stdout, "SAX.hasExternalSubset()\n");
- return(0);
- }
- static void
- internalSubsetDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
- const xmlChar *ExternalID, const xmlChar *SystemID)
- {
- fprintf(stdout, "SAX.internalSubset(%s,", name);
- if (ExternalID == NULL)
- fprintf(stdout, " ,");
- else
- fprintf(stdout, " %s,", ExternalID);
- if (SystemID == NULL)
- fprintf(stdout, " )\n");
- else
- fprintf(stdout, " %s)\n", SystemID);
- }
- static xmlParserInputPtr
- resolveEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *publicId, const xmlChar *systemId)
- {
-
-
- fprintf(stdout, "SAX.resolveEntity(");
- if (publicId != NULL)
- fprintf(stdout, "%s", (char *)publicId);
- else
- fprintf(stdout, " ");
- if (systemId != NULL)
- fprintf(stdout, ", %s)\n", (char *)systemId);
- else
- fprintf(stdout, ", )\n");
- return(NULL);
- }
- static xmlEntityPtr
- getEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
- {
- fprintf(stdout, "SAX.getEntity(%s)\n", name);
- return(NULL);
- }
- static xmlEntityPtr
- getParameterEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
- {
- fprintf(stdout, "SAX.getParameterEntity(%s)\n", name);
- return(NULL);
- }
- static void
- entityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type,
- const xmlChar *publicId, const xmlChar *systemId, xmlChar *content)
- {
- fprintf(stdout, "SAX.entityDecl(%s, %d, %s, %s, %s)\n",
- name, type, publicId, systemId, content);
- }
- static void
- attributeDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *elem, const xmlChar *name,
- int type, int def, const xmlChar *defaultValue,
- xmlEnumerationPtr tree ATTRIBUTE_UNUSED)
- {
- fprintf(stdout, "SAX.attributeDecl(%s, %s, %d, %d, %s, ...)\n",
- elem, name, type, def, defaultValue);
- }
- static void
- elementDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type,
- xmlElementContentPtr content ATTRIBUTE_UNUSED)
- {
- fprintf(stdout, "SAX.elementDecl(%s, %d, ...)\n",
- name, type);
- }
- static void
- notationDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
- const xmlChar *publicId, const xmlChar *systemId)
- {
- fprintf(stdout, "SAX.notationDecl(%s, %s, %s)\n",
- (char *) name, (char *) publicId, (char *) systemId);
- }
- static void
- unparsedEntityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name,
- const xmlChar *publicId, const xmlChar *systemId,
- const xmlChar *notationName)
- {
- fprintf(stdout, "SAX.unparsedEntityDecl(%s, %s, %s, %s)\n",
- (char *) name, (char *) publicId, (char *) systemId,
- (char *) notationName);
- }
- static void
- setDocumentLocatorDebug(void *ctx ATTRIBUTE_UNUSED, xmlSAXLocatorPtr loc ATTRIBUTE_UNUSED)
- {
- fprintf(stdout, "SAX.setDocumentLocator()\n");
- }
- static void
- startDocumentDebug(void *ctx ATTRIBUTE_UNUSED)
- {
- fprintf(stdout, "SAX.startDocument()\n");
- }
- static void
- endDocumentDebug(void *ctx ATTRIBUTE_UNUSED)
- {
- fprintf(stdout, "SAX.endDocument()\n");
- }
- static void
- startElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar **atts)
- {
- int i;
- fprintf(stdout, "SAX.startElement(%s", (char *) name);
- if (atts != NULL) {
- for (i = 0;(atts[i] != NULL);i++) {
- fprintf(stdout, ", %s", atts[i++]);
- if (atts[i] != NULL) {
- unsigned char output[40];
- const unsigned char *att = atts[i];
- int outlen, attlen;
- fprintf(stdout, "='");
- while ((attlen = strlen((char*)att)) > 0) {
- outlen = sizeof output - 1;
- htmlEncodeEntities(output, &outlen, att, &attlen, '\'');
- output[outlen] = 0;
- fprintf(stdout, "%s", (char *) output);
- att += attlen;
- }
- fprintf(stdout, "'");
- }
- }
- }
- fprintf(stdout, ")\n");
- }
- static void
- endElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
- {
- fprintf(stdout, "SAX.endElement(%s)\n", (char *) name);
- }
- static void
- charactersDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
- {
- unsigned char output[40];
- int inlen = len, outlen = 30;
- htmlEncodeEntities(output, &outlen, ch, &inlen, 0);
- output[outlen] = 0;
- fprintf(stdout, "SAX.characters(%s, %d)\n", output, len);
- }
- static void
- cdataDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
- {
- unsigned char output[40];
- int inlen = len, outlen = 30;
- htmlEncodeEntities(output, &outlen, ch, &inlen, 0);
- output[outlen] = 0;
- fprintf(stdout, "SAX.cdata(%s, %d)\n", output, len);
- }
- static void
- referenceDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
- {
- fprintf(stdout, "SAX.reference(%s)\n", name);
- }
- static void
- ignorableWhitespaceDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
- {
- char output[40];
- int i;
- for (i = 0;(i<len) && (i < 30);i++)
- output[i] = ch[i];
- output[i] = 0;
- fprintf(stdout, "SAX.ignorableWhitespace(%s, %d)\n", output, len);
- }
- static void
- processingInstructionDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *target,
- const xmlChar *data)
- {
- fprintf(stdout, "SAX.processingInstruction(%s, %s)\n",
- (char *) target, (char *) data);
- }
- static void
- commentDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *value)
- {
- fprintf(stdout, "SAX.comment(%s)\n", value);
- }
- static void XMLCDECL
- warningDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
- {
- va_list args;
- va_start(args, msg);
- fprintf(stdout, "SAX.warning: ");
- vfprintf(stdout, msg, args);
- va_end(args);
- }
- static void XMLCDECL
- errorDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
- {
- va_list args;
- va_start(args, msg);
- fprintf(stdout, "SAX.error: ");
- vfprintf(stdout, msg, args);
- va_end(args);
- }
- static void XMLCDECL
- fatalErrorDebug(void *ctx ATTRIBUTE_UNUSED, const char *msg, ...)
- {
- va_list args;
- va_start(args, msg);
- fprintf(stdout, "SAX.fatalError: ");
- vfprintf(stdout, msg, args);
- va_end(args);
- }
- static xmlSAXHandler debugSAXHandlerStruct = {
- internalSubsetDebug,
- isStandaloneDebug,
- hasInternalSubsetDebug,
- hasExternalSubsetDebug,
- resolveEntityDebug,
- getEntityDebug,
- entityDeclDebug,
- notationDeclDebug,
- attributeDeclDebug,
- elementDeclDebug,
- unparsedEntityDeclDebug,
- setDocumentLocatorDebug,
- startDocumentDebug,
- endDocumentDebug,
- startElementDebug,
- endElementDebug,
- referenceDebug,
- charactersDebug,
- ignorableWhitespaceDebug,
- processingInstructionDebug,
- commentDebug,
- warningDebug,
- errorDebug,
- fatalErrorDebug,
- getParameterEntityDebug,
- cdataDebug,
- NULL,
- 1,
- NULL,
- NULL,
- NULL,
- NULL
- };
- xmlSAXHandlerPtr debugSAXHandler = &debugSAXHandlerStruct;
- static void
- parseSAXFile(char *filename) {
- htmlDocPtr doc = NULL;
-
- #ifdef LIBXML_PUSH_ENABLED
- if (push) {
- FILE *f;
- #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
- f = fopen(filename, "rb");
- #else
- f = fopen(filename, "r");
- #endif
- if (f != NULL) {
- int res, size = 3;
- char chars[4096];
- htmlParserCtxtPtr ctxt;
-
- size = 4096;
- res = fread(chars, 1, 4, f);
- if (res > 0) {
- ctxt = htmlCreatePushParserCtxt(emptySAXHandler, NULL,
- chars, res, filename, XML_CHAR_ENCODING_NONE);
- while ((res = fread(chars, 1, size, f)) > 0) {
- htmlParseChunk(ctxt, chars, res, 0);
- }
- htmlParseChunk(ctxt, chars, 0, 1);
- doc = ctxt->myDoc;
- htmlFreeParserCtxt(ctxt);
- }
- if (doc != NULL) {
- fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
- xmlFreeDoc(doc);
- }
- fclose(f);
- }
- if (!noout) {
- #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
- f = fopen(filename, "rb");
- #else
- f = fopen(filename, "r");
- #endif
- if (f != NULL) {
- int res, size = 3;
- char chars[4096];
- htmlParserCtxtPtr ctxt;
-
- size = 4096;
- res = fread(chars, 1, 4, f);
- if (res > 0) {
- ctxt = htmlCreatePushParserCtxt(debugSAXHandler, NULL,
- chars, res, filename, XML_CHAR_ENCODING_NONE);
- while ((res = fread(chars, 1, size, f)) > 0) {
- htmlParseChunk(ctxt, chars, res, 0);
- }
- htmlParseChunk(ctxt, chars, 0, 1);
- doc = ctxt->myDoc;
- htmlFreeParserCtxt(ctxt);
- }
- if (doc != NULL) {
- fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
- xmlFreeDoc(doc);
- }
- fclose(f);
- }
- }
- } else {
- #endif
- doc = htmlSAXParseFile(filename, NULL, emptySAXHandler, NULL);
- if (doc != NULL) {
- fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
- xmlFreeDoc(doc);
- }
- if (!noout) {
-
- doc = htmlSAXParseFile(filename, NULL, debugSAXHandler, NULL);
- if (doc != NULL) {
- fprintf(stdout, "htmlSAXParseFile returned non-NULL\n");
- xmlFreeDoc(doc);
- }
- }
- #ifdef LIBXML_PUSH_ENABLED
- }
- #endif
- }
- static void
- parseAndPrintFile(char *filename) {
- htmlDocPtr doc = NULL;
-
- #ifdef LIBXML_PUSH_ENABLED
- if (push) {
- FILE *f;
- #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
- f = fopen(filename, "rb");
- #else
- f = fopen(filename, "r");
- #endif
- if (f != NULL) {
- int res, size = 3;
- char chars[4096];
- htmlParserCtxtPtr ctxt;
-
- size = 4096;
- res = fread(chars, 1, 4, f);
- if (res > 0) {
- ctxt = htmlCreatePushParserCtxt(NULL, NULL,
- chars, res, filename, XML_CHAR_ENCODING_NONE);
- while ((res = fread(chars, 1, size, f)) > 0) {
- htmlParseChunk(ctxt, chars, res, 0);
- }
- htmlParseChunk(ctxt, chars, 0, 1);
- doc = ctxt->myDoc;
- htmlFreeParserCtxt(ctxt);
- }
- fclose(f);
- }
- } else {
- doc = htmlReadFile(filename, NULL, options);
- }
- #else
- doc = htmlReadFile(filename,NULL,options);
- #endif
- if (doc == NULL) {
- xmlGenericError(xmlGenericErrorContext,
- "Could not parse %s\n", filename);
- }
- #ifdef LIBXML_TREE_ENABLED
-
- if (copy) {
- htmlDocPtr tmp;
- tmp = doc;
- doc = xmlCopyDoc(doc, 1);
- xmlFreeDoc(tmp);
- }
- #endif
- #ifdef LIBXML_OUTPUT_ENABLED
-
- if (!noout) {
- #ifdef LIBXML_DEBUG_ENABLED
- if (!debug) {
- if (encoding)
- htmlSaveFileEnc("-", doc, encoding);
- else
- htmlDocDump(stdout, doc);
- } else
- xmlDebugDumpDocument(stdout, doc);
- #else
- if (encoding)
- htmlSaveFileEnc("-", doc, encoding);
- else
- htmlDocDump(stdout, doc);
- #endif
- }
- #endif
-
- xmlFreeDoc(doc);
- }
- int main(int argc, char **argv) {
- int i, count;
- int files = 0;
- for (i = 1; i < argc ; i++) {
- #ifdef LIBXML_DEBUG_ENABLED
- if ((!strcmp(argv[i], "-debug")) || (!strcmp(argv[i], "--debug")))
- debug++;
- else
- #endif
- if ((!strcmp(argv[i], "-copy")) || (!strcmp(argv[i], "--copy")))
- copy++;
- #ifdef LIBXML_PUSH_ENABLED
- else if ((!strcmp(argv[i], "-push")) || (!strcmp(argv[i], "--push")))
- push++;
- #endif
- else if ((!strcmp(argv[i], "-sax")) || (!strcmp(argv[i], "--sax")))
- sax++;
- else if ((!strcmp(argv[i], "-noout")) || (!strcmp(argv[i], "--noout")))
- noout++;
- else if ((!strcmp(argv[i], "-repeat")) ||
- (!strcmp(argv[i], "--repeat")))
- repeat++;
- else if ((!strcmp(argv[i], "-encode")) ||
- (!strcmp(argv[i], "--encode"))) {
- i++;
- encoding = argv[i];
- }
- }
- for (i = 1; i < argc ; i++) {
- if ((!strcmp(argv[i], "-encode")) ||
- (!strcmp(argv[i], "--encode"))) {
- i++;
- continue;
- }
- if (argv[i][0] != '-') {
- if (repeat) {
- for (count = 0;count < 100 * repeat;count++) {
- if (sax)
- parseSAXFile(argv[i]);
- else
- parseAndPrintFile(argv[i]);
- }
- } else {
- if (sax)
- parseSAXFile(argv[i]);
- else
- parseAndPrintFile(argv[i]);
- }
- files ++;
- }
- }
- if (files == 0) {
- printf("Usage : %s [--debug] [--copy] [--copy] HTMLfiles ...\n",
- argv[0]);
- printf("\tParse the HTML files and output the result of the parsing\n");
- #ifdef LIBXML_DEBUG_ENABLED
- printf("\t--debug : dump a debug tree of the in-memory document\n");
- #endif
- printf("\t--copy : used to test the internal copy implementation\n");
- printf("\t--sax : debug the sequence of SAX callbacks\n");
- printf("\t--repeat : parse the file 100 times, for timing\n");
- printf("\t--noout : do not print the result\n");
- #ifdef LIBXML_PUSH_ENABLED
- printf("\t--push : use the push mode parser\n");
- #endif
- printf("\t--encode encoding : output in the given encoding\n");
- }
- xmlCleanupParser();
- xmlMemoryDump();
- return(0);
- }
- #else
- #include <stdio.h>
- int main(int argc ATTRIBUTE_UNUSED, char **argv ATTRIBUTE_UNUSED) {
- printf("%s : HTML support not compiled in\n", argv[0]);
- return(0);
- }
- #endif
|