Eneboo - Documentación para desarrolladores
|
00001 /* 00002 * Summary: interface for an HTML 4.0 non-verifying parser 00003 * Description: this module implements an HTML 4.0 non-verifying parser 00004 * with API compatible with the XML parser ones. It should 00005 * be able to parse "real world" HTML, even if severely 00006 * broken from a specification point of view. 00007 * 00008 * Copy: See Copyright for the status of this software. 00009 * 00010 * Author: Daniel Veillard 00011 */ 00012 00013 #ifndef __HTML_PARSER_H__ 00014 #define __HTML_PARSER_H__ 00015 #include <libxml/xmlversion.h> 00016 #include <libxml/parser.h> 00017 00018 #ifdef LIBXML_HTML_ENABLED 00019 00020 #ifdef __cplusplus 00021 extern "C" { 00022 #endif 00023 00024 /* 00025 * Most of the back-end structures from XML and HTML are shared. 00026 */ 00027 typedef xmlParserCtxt htmlParserCtxt; 00028 typedef xmlParserCtxtPtr htmlParserCtxtPtr; 00029 typedef xmlParserNodeInfo htmlParserNodeInfo; 00030 typedef xmlSAXHandler htmlSAXHandler; 00031 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; 00032 typedef xmlParserInput htmlParserInput; 00033 typedef xmlParserInputPtr htmlParserInputPtr; 00034 typedef xmlDocPtr htmlDocPtr; 00035 typedef xmlNodePtr htmlNodePtr; 00036 00037 /* 00038 * Internal description of an HTML element, representing HTML 4.01 00039 * and XHTML 1.0 (which share the same structure). 00040 */ 00041 typedef struct _htmlElemDesc htmlElemDesc; 00042 typedef htmlElemDesc *htmlElemDescPtr; 00043 struct _htmlElemDesc { 00044 const char *name; /* The tag name */ 00045 char startTag; /* Whether the start tag can be implied */ 00046 char endTag; /* Whether the end tag can be implied */ 00047 char saveEndTag; /* Whether the end tag should be saved */ 00048 char empty; /* Is this an empty element ? */ 00049 char depr; /* Is this a deprecated element ? */ 00050 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ 00051 char isinline; /* is this a block 0 or inline 1 element */ 00052 const char *desc; /* the description */ 00053 00054 /* NRK Jan.2003 00055 * New fields encapsulating HTML structure 00056 * 00057 * Bugs: 00058 * This is a very limited representation. It fails to tell us when 00059 * an element *requires* subelements (we only have whether they're 00060 * allowed or not), and it doesn't tell us where CDATA and PCDATA 00061 * are allowed. Some element relationships are not fully represented: 00062 * these are flagged with the word MODIFIER 00063 */ 00064 const char** subelts; /* allowed sub-elements of this element */ 00065 const char* defaultsubelt; /* subelement for suggested auto-repair 00066 if necessary or NULL */ 00067 const char** attrs_opt; /* Optional Attributes */ 00068 const char** attrs_depr; /* Additional deprecated attributes */ 00069 const char** attrs_req; /* Required attributes */ 00070 }; 00071 00072 /* 00073 * Internal description of an HTML entity. 00074 */ 00075 typedef struct _htmlEntityDesc htmlEntityDesc; 00076 typedef htmlEntityDesc *htmlEntityDescPtr; 00077 struct _htmlEntityDesc { 00078 unsigned int value; /* the UNICODE value for the character */ 00079 const char *name; /* The entity name */ 00080 const char *desc; /* the description */ 00081 }; 00082 00083 /* 00084 * There is only few public functions. 00085 */ 00086 XMLPUBFUN const htmlElemDesc * XMLCALL 00087 htmlTagLookup (const xmlChar *tag); 00088 XMLPUBFUN const htmlEntityDesc * XMLCALL 00089 htmlEntityLookup(const xmlChar *name); 00090 XMLPUBFUN const htmlEntityDesc * XMLCALL 00091 htmlEntityValueLookup(unsigned int value); 00092 00093 XMLPUBFUN int XMLCALL 00094 htmlIsAutoClosed(htmlDocPtr doc, 00095 htmlNodePtr elem); 00096 XMLPUBFUN int XMLCALL 00097 htmlAutoCloseTag(htmlDocPtr doc, 00098 const xmlChar *name, 00099 htmlNodePtr elem); 00100 XMLPUBFUN const htmlEntityDesc * XMLCALL 00101 htmlParseEntityRef(htmlParserCtxtPtr ctxt, 00102 const xmlChar **str); 00103 XMLPUBFUN int XMLCALL 00104 htmlParseCharRef(htmlParserCtxtPtr ctxt); 00105 XMLPUBFUN void XMLCALL 00106 htmlParseElement(htmlParserCtxtPtr ctxt); 00107 00108 XMLPUBFUN htmlParserCtxtPtr XMLCALL 00109 htmlNewParserCtxt(void); 00110 00111 XMLPUBFUN htmlParserCtxtPtr XMLCALL 00112 htmlCreateMemoryParserCtxt(const char *buffer, 00113 int size); 00114 00115 XMLPUBFUN int XMLCALL 00116 htmlParseDocument(htmlParserCtxtPtr ctxt); 00117 XMLPUBFUN htmlDocPtr XMLCALL 00118 htmlSAXParseDoc (xmlChar *cur, 00119 const char *encoding, 00120 htmlSAXHandlerPtr sax, 00121 void *userData); 00122 XMLPUBFUN htmlDocPtr XMLCALL 00123 htmlParseDoc (xmlChar *cur, 00124 const char *encoding); 00125 XMLPUBFUN htmlDocPtr XMLCALL 00126 htmlSAXParseFile(const char *filename, 00127 const char *encoding, 00128 htmlSAXHandlerPtr sax, 00129 void *userData); 00130 XMLPUBFUN htmlDocPtr XMLCALL 00131 htmlParseFile (const char *filename, 00132 const char *encoding); 00133 XMLPUBFUN int XMLCALL 00134 UTF8ToHtml (unsigned char *out, 00135 int *outlen, 00136 const unsigned char *in, 00137 int *inlen); 00138 XMLPUBFUN int XMLCALL 00139 htmlEncodeEntities(unsigned char *out, 00140 int *outlen, 00141 const unsigned char *in, 00142 int *inlen, int quoteChar); 00143 XMLPUBFUN int XMLCALL 00144 htmlIsScriptAttribute(const xmlChar *name); 00145 XMLPUBFUN int XMLCALL 00146 htmlHandleOmittedElem(int val); 00147 00148 #ifdef LIBXML_PUSH_ENABLED 00149 00152 XMLPUBFUN htmlParserCtxtPtr XMLCALL 00153 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, 00154 void *user_data, 00155 const char *chunk, 00156 int size, 00157 const char *filename, 00158 xmlCharEncoding enc); 00159 XMLPUBFUN int XMLCALL 00160 htmlParseChunk (htmlParserCtxtPtr ctxt, 00161 const char *chunk, 00162 int size, 00163 int terminate); 00164 #endif /* LIBXML_PUSH_ENABLED */ 00165 00166 XMLPUBFUN void XMLCALL 00167 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); 00168 00169 /* 00170 * New set of simpler/more flexible APIs 00171 */ 00178 typedef enum { 00179 HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ 00180 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ 00181 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ 00182 HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ 00183 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ 00184 HTML_PARSE_NONET = 1<<11,/* Forbid network access */ 00185 HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ 00186 HTML_PARSE_COMPACT = 1<<16 /* compact small text nodes */ 00187 } htmlParserOption; 00188 00189 XMLPUBFUN void XMLCALL 00190 htmlCtxtReset (htmlParserCtxtPtr ctxt); 00191 XMLPUBFUN int XMLCALL 00192 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, 00193 int options); 00194 XMLPUBFUN htmlDocPtr XMLCALL 00195 htmlReadDoc (const xmlChar *cur, 00196 const char *URL, 00197 const char *encoding, 00198 int options); 00199 XMLPUBFUN htmlDocPtr XMLCALL 00200 htmlReadFile (const char *URL, 00201 const char *encoding, 00202 int options); 00203 XMLPUBFUN htmlDocPtr XMLCALL 00204 htmlReadMemory (const char *buffer, 00205 int size, 00206 const char *URL, 00207 const char *encoding, 00208 int options); 00209 XMLPUBFUN htmlDocPtr XMLCALL 00210 htmlReadFd (int fd, 00211 const char *URL, 00212 const char *encoding, 00213 int options); 00214 XMLPUBFUN htmlDocPtr XMLCALL 00215 htmlReadIO (xmlInputReadCallback ioread, 00216 xmlInputCloseCallback ioclose, 00217 void *ioctx, 00218 const char *URL, 00219 const char *encoding, 00220 int options); 00221 XMLPUBFUN htmlDocPtr XMLCALL 00222 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, 00223 const xmlChar *cur, 00224 const char *URL, 00225 const char *encoding, 00226 int options); 00227 XMLPUBFUN htmlDocPtr XMLCALL 00228 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, 00229 const char *filename, 00230 const char *encoding, 00231 int options); 00232 XMLPUBFUN htmlDocPtr XMLCALL 00233 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, 00234 const char *buffer, 00235 int size, 00236 const char *URL, 00237 const char *encoding, 00238 int options); 00239 XMLPUBFUN htmlDocPtr XMLCALL 00240 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, 00241 int fd, 00242 const char *URL, 00243 const char *encoding, 00244 int options); 00245 XMLPUBFUN htmlDocPtr XMLCALL 00246 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, 00247 xmlInputReadCallback ioread, 00248 xmlInputCloseCallback ioclose, 00249 void *ioctx, 00250 const char *URL, 00251 const char *encoding, 00252 int options); 00253 00254 /* NRK/Jan2003: further knowledge of HTML structure 00255 */ 00256 typedef enum { 00257 HTML_NA = 0 , /* something we don't check at all */ 00258 HTML_INVALID = 0x1 , 00259 HTML_DEPRECATED = 0x2 , 00260 HTML_VALID = 0x4 , 00261 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ 00262 } htmlStatus ; 00263 00264 /* Using htmlElemDesc rather than name here, to emphasise the fact 00265 that otherwise there's a lookup overhead 00266 */ 00267 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; 00268 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; 00269 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; 00270 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; 00277 #define htmlDefaultSubelement(elt) elt->defaultsubelt 00278 00288 #define htmlElementAllowedHereDesc(parent,elt) \ 00289 htmlElementAllowedHere((parent), (elt)->name) 00290 00296 #define htmlRequiredAttrs(elt) (elt)->attrs_req 00297 00298 00299 #ifdef __cplusplus 00300 } 00301 #endif 00302 00303 #endif /* LIBXML_HTML_ENABLED */ 00304 #endif /* __HTML_PARSER_H__ */