Eneboo - Documentación para desarrolladores
src/libdigidoc/libxml2/include/libxml/HTMLparser.h
Ir a la documentación de este archivo.
00001 /*
00002  * Summary: interface for an HTML 4.0 non-verifying parser
00003  * Description: this module implements an HTML 4.0 non-verifying parser
00004  *              with API compatible with the XML parser ones. It should
00005  *              be able to parse "real world" HTML, even if severely
00006  *              broken from a specification point of view.
00007  *
00008  * Copy: See Copyright for the status of this software.
00009  *
00010  * Author: Daniel Veillard
00011  */
00012 
00013 #ifndef __HTML_PARSER_H__
00014 #define __HTML_PARSER_H__
00015 #include <libxml/xmlversion.h>
00016 #include <libxml/parser.h>
00017 
00018 #ifdef LIBXML_HTML_ENABLED
00019 
00020 #ifdef __cplusplus
00021 extern "C" {
00022 #endif
00023 
00024 /*
00025  * Most of the back-end structures from XML and HTML are shared.
00026  */
00027 typedef xmlParserCtxt htmlParserCtxt;
00028 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
00029 typedef xmlParserNodeInfo htmlParserNodeInfo;
00030 typedef xmlSAXHandler htmlSAXHandler;
00031 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
00032 typedef xmlParserInput htmlParserInput;
00033 typedef xmlParserInputPtr htmlParserInputPtr;
00034 typedef xmlDocPtr htmlDocPtr;
00035 typedef xmlNodePtr htmlNodePtr;
00036 
/*
 * Internal description of an HTML element, representing HTML 4.01
 * and XHTML 1.0 (which share the same structure).
 *
 * Field order and types are part of the binary layout: do not reorder.
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;           /* The tag name */
    char startTag;              /* Whether the start tag can be implied */
    char endTag;                /* Whether the end tag can be implied */
    char saveEndTag;            /* Whether the end tag should be saved */
    char empty;                 /* Is this an empty element? */
    char depr;                  /* Is this a deprecated element? */
    char dtd;                   /* 1: only in Loose DTD, 2: only Frameset one */
    char isinline;              /* Is this a block (0) or inline (1) element */
    const char *desc;           /* The description */

    /*
     * NRK Jan.2003
     * New fields encapsulating HTML structure.
     *
     * Bugs:
     *      This is a very limited representation.  It fails to tell us when
     *      an element *requires* subelements (we only have whether they're
     *      allowed or not), and it doesn't tell us where CDATA and PCDATA
     *      are allowed.  Some element relationships are not fully
     *      represented: these are flagged with the word MODIFIER.
     *
     * NOTE(review): the `const char **` lists are presumably
     * NULL-terminated arrays of names -- confirm against the element table.
     */
    const char **subelts;       /* Allowed sub-elements of this element */
    const char *defaultsubelt;  /* Sub-element for suggested auto-repair
                                   if necessary, or NULL */
    const char **attrs_opt;     /* Optional attributes */
    const char **attrs_depr;    /* Additional deprecated attributes */
    const char **attrs_req;     /* Required attributes */
};
00071 
/*
 * Internal description of an HTML entity.
 *
 * Field order and types are part of the binary layout: do not reorder.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    unsigned int value;         /* The UNICODE value for the character */
    const char *name;           /* The entity name */
    const char *desc;           /* The description */
};
00082 
00083 /*
00084  * There is only few public functions.
00085  */
00086 XMLPUBFUN const htmlElemDesc * XMLCALL  
00087                         htmlTagLookup   (const xmlChar *tag);
00088 XMLPUBFUN const htmlEntityDesc * XMLCALL        
00089                         htmlEntityLookup(const xmlChar *name);
00090 XMLPUBFUN const htmlEntityDesc * XMLCALL        
00091                         htmlEntityValueLookup(unsigned int value);
00092 
00093 XMLPUBFUN int XMLCALL                   
00094                         htmlIsAutoClosed(htmlDocPtr doc,
00095                                          htmlNodePtr elem);
00096 XMLPUBFUN int XMLCALL                   
00097                         htmlAutoCloseTag(htmlDocPtr doc,
00098                                          const xmlChar *name,
00099                                          htmlNodePtr elem);
00100 XMLPUBFUN const htmlEntityDesc * XMLCALL        
00101                         htmlParseEntityRef(htmlParserCtxtPtr ctxt,
00102                                          const xmlChar **str);
00103 XMLPUBFUN int XMLCALL                   
00104                         htmlParseCharRef(htmlParserCtxtPtr ctxt);
00105 XMLPUBFUN void XMLCALL                  
00106                         htmlParseElement(htmlParserCtxtPtr ctxt);
00107 
00108 XMLPUBFUN htmlParserCtxtPtr XMLCALL     
00109                         htmlNewParserCtxt(void);
00110 
00111 XMLPUBFUN htmlParserCtxtPtr XMLCALL     
00112                         htmlCreateMemoryParserCtxt(const char *buffer,
00113                                                    int size);
00114 
00115 XMLPUBFUN int XMLCALL                   
00116                         htmlParseDocument(htmlParserCtxtPtr ctxt);
00117 XMLPUBFUN htmlDocPtr XMLCALL            
00118                         htmlSAXParseDoc (xmlChar *cur,
00119                                          const char *encoding,
00120                                          htmlSAXHandlerPtr sax,
00121                                          void *userData);
00122 XMLPUBFUN htmlDocPtr XMLCALL            
00123                         htmlParseDoc    (xmlChar *cur,
00124                                          const char *encoding);
00125 XMLPUBFUN htmlDocPtr XMLCALL            
00126                         htmlSAXParseFile(const char *filename,
00127                                          const char *encoding,
00128                                          htmlSAXHandlerPtr sax,
00129                                          void *userData);
00130 XMLPUBFUN htmlDocPtr XMLCALL            
00131                         htmlParseFile   (const char *filename,
00132                                          const char *encoding);
00133 XMLPUBFUN int XMLCALL                   
00134                         UTF8ToHtml      (unsigned char *out,
00135                                          int *outlen,
00136                                          const unsigned char *in,
00137                                          int *inlen);
00138 XMLPUBFUN int XMLCALL                   
00139                         htmlEncodeEntities(unsigned char *out,
00140                                          int *outlen,
00141                                          const unsigned char *in,
00142                                          int *inlen, int quoteChar);
00143 XMLPUBFUN int XMLCALL                   
00144                         htmlIsScriptAttribute(const xmlChar *name);
00145 XMLPUBFUN int XMLCALL                   
00146                         htmlHandleOmittedElem(int val);
00147 
#ifdef LIBXML_PUSH_ENABLED

/*
 * Push-parser entry points. They are only declared when
 * LIBXML_PUSH_ENABLED is defined.
 */
XMLPUBFUN htmlParserCtxtPtr XMLCALL
                        htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                                                 void *user_data,
                                                 const char *chunk,
                                                 int size,
                                                 const char *filename,
                                                 xmlCharEncoding enc);
XMLPUBFUN int XMLCALL
                        htmlParseChunk          (htmlParserCtxtPtr ctxt,
                                                 const char *chunk,
                                                 int size,
                                                 int terminate);
#endif /* LIBXML_PUSH_ENABLED */
00165 
00166 XMLPUBFUN void XMLCALL                  
00167                         htmlFreeParserCtxt      (htmlParserCtxtPtr ctxt);
00168 
/*
 * New set of simpler/more flexible APIs
 */

/*
 * htmlParserOption:
 *
 * Parser option flags. Every enumerator is a distinct single bit, so
 * several of them can be OR-ed into one int. The bit positions are
 * deliberately not contiguous; keep the numeric values unchanged.
 */
typedef enum {
    HTML_PARSE_RECOVER   = 1 << 0,  /* Relaxed parsing */
    HTML_PARSE_NOERROR   = 1 << 5,  /* suppress error reports */
    HTML_PARSE_NOWARNING = 1 << 6,  /* suppress warning reports */
    HTML_PARSE_PEDANTIC  = 1 << 7,  /* pedantic error reporting */
    HTML_PARSE_NOBLANKS  = 1 << 8,  /* remove blank nodes */
    HTML_PARSE_NONET     = 1 << 11, /* Forbid network access */
    HTML_PARSE_NOIMPLIED = 1 << 13, /* Do not add implied html/body... elements */
    HTML_PARSE_COMPACT   = 1 << 16  /* compact small text nodes */
} htmlParserOption;
00188 
00189 XMLPUBFUN void XMLCALL
00190                 htmlCtxtReset           (htmlParserCtxtPtr ctxt);
00191 XMLPUBFUN int XMLCALL
00192                 htmlCtxtUseOptions      (htmlParserCtxtPtr ctxt,
00193                                          int options);
00194 XMLPUBFUN htmlDocPtr XMLCALL
00195                 htmlReadDoc             (const xmlChar *cur,
00196                                          const char *URL,
00197                                          const char *encoding,
00198                                          int options);
00199 XMLPUBFUN htmlDocPtr XMLCALL
00200                 htmlReadFile            (const char *URL,
00201                                          const char *encoding,
00202                                          int options);
00203 XMLPUBFUN htmlDocPtr XMLCALL
00204                 htmlReadMemory          (const char *buffer,
00205                                          int size,
00206                                          const char *URL,
00207                                          const char *encoding,
00208                                          int options);
00209 XMLPUBFUN htmlDocPtr XMLCALL
00210                 htmlReadFd              (int fd,
00211                                          const char *URL,
00212                                          const char *encoding,
00213                                          int options);
00214 XMLPUBFUN htmlDocPtr XMLCALL
00215                 htmlReadIO              (xmlInputReadCallback ioread,
00216                                          xmlInputCloseCallback ioclose,
00217                                          void *ioctx,
00218                                          const char *URL,
00219                                          const char *encoding,
00220                                          int options);
00221 XMLPUBFUN htmlDocPtr XMLCALL
00222                 htmlCtxtReadDoc         (xmlParserCtxtPtr ctxt,
00223                                          const xmlChar *cur,
00224                                          const char *URL,
00225                                          const char *encoding,
00226                                          int options);
00227 XMLPUBFUN htmlDocPtr XMLCALL
00228                 htmlCtxtReadFile                (xmlParserCtxtPtr ctxt,
00229                                          const char *filename,
00230                                          const char *encoding,
00231                                          int options);
00232 XMLPUBFUN htmlDocPtr XMLCALL
00233                 htmlCtxtReadMemory              (xmlParserCtxtPtr ctxt,
00234                                          const char *buffer,
00235                                          int size,
00236                                          const char *URL,
00237                                          const char *encoding,
00238                                          int options);
00239 XMLPUBFUN htmlDocPtr XMLCALL
00240                 htmlCtxtReadFd          (xmlParserCtxtPtr ctxt,
00241                                          int fd,
00242                                          const char *URL,
00243                                          const char *encoding,
00244                                          int options);
00245 XMLPUBFUN htmlDocPtr XMLCALL
00246                 htmlCtxtReadIO          (xmlParserCtxtPtr ctxt,
00247                                          xmlInputReadCallback ioread,
00248                                          xmlInputCloseCallback ioclose,
00249                                          void *ioctx,
00250                                          const char *URL,
00251                                          const char *encoding,
00252                                          int options);
00253 
/*
 * NRK/Jan2003: further knowledge of HTML structure
 *
 * htmlStatus values are bit patterns: HTML_REQUIRED (0xc) includes the
 * HTML_VALID bit (0x4), so `status & HTML_VALID` is non-zero for both
 * HTML_VALID and HTML_REQUIRED.
 */
typedef enum {
    HTML_NA         = 0,    /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = 0xc   /* VALID bit set so ( & HTML_VALID ) is TRUE */
} htmlStatus;
00263 
00264 /* Using htmlElemDesc rather than name here, to emphasise the fact
00265    that otherwise there's a lookup overhead
00266 */
00267 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
00268 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
00269 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
00270 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ;
/*
 * htmlDefaultSubelement:
 * @elt: pointer to an element description
 *
 * Expands to the `defaultsubelt` member of @elt.
 *
 * The argument and the whole expansion are parenthesised so that
 * compound arguments such as `table + i` or `cond ? a : b` bind as the
 * caller intends. @elt is evaluated exactly once.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
00278 
/*
 * htmlElementAllowedHereDesc:
 * @parent: element description passed through as the first argument
 * @elt: element description whose `name` member is passed as the second
 *
 * Convenience wrapper that forwards to
 * htmlElementAllowedHere(parent, elt->name) and yields its int result.
 * Each argument is evaluated exactly once.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
        (htmlElementAllowedHere((parent), (elt)->name))
00290 
/*
 * htmlRequiredAttrs:
 * @elt: pointer to an element description
 *
 * Expands to the `attrs_req` member of @elt. The whole expansion is
 * parenthesised so it behaves as a single primary expression wherever
 * it is used. @elt is evaluated exactly once.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
00297 
00298 
00299 #ifdef __cplusplus
00300 }
00301 #endif
00302 
00303 #endif /* LIBXML_HTML_ENABLED */
00304 #endif /* __HTML_PARSER_H__ */
 Todo Clases Namespaces Archivos Funciones Variables 'typedefs' Enumeraciones Valores de enumeraciones Propiedades Amigas 'defines'