/*
  tags.c -- recognize HTML tags

  (c) 1998-2001 (W3C) MIT, INRIA, Keio University
  See tidy.c for the copyright notice.

  The HTML tags are stored as 8 bit ASCII strings.
  Use lookupw() to find a tag given a wide char string.

  CVS Info :

    $Author: terry_teague $
    $Date: 2001/09/01 04:15:40 $
    $Revision: 1.19 $
*/

#include "platform.h"   /* platform independent stuff */
#include "html.h"       /* to pull in definition of nodes */

/* number of buckets in the tag dictionary hash table */
#define HASHSIZE 357

extern Bool XmlTags;

/*
  Cached dictionary entries for frequently tested tags.
  They are filled in by InitTags() and become stale once
  FreeTags() has released the dictionary.
*/
Dict *tag_html;
Dict *tag_head;
Dict *tag_title;
Dict *tag_base;
Dict *tag_meta;
Dict *tag_body;
Dict *tag_frameset;
Dict *tag_frame;
Dict *tag_iframe;       /* #433359 - fix by Randy Waki 12 Mar 01 */
Dict *tag_noframes;
Dict *tag_hr;
Dict *tag_h1;
Dict *tag_h2;
Dict *tag_pre;
Dict *tag_listing;
Dict *tag_p;
Dict *tag_ul;
Dict *tag_ol;
Dict *tag_dl;
Dict *tag_dir;
Dict *tag_li;
Dict *tag_dt;
Dict *tag_dd;
Dict *tag_td;
Dict *tag_th;
Dict *tag_tr;
Dict *tag_col;
Dict *tag_br;
Dict *tag_a;
Dict *tag_link;
Dict *tag_b;
Dict *tag_i;
Dict *tag_strong;
Dict *tag_em;
Dict *tag_big;
Dict *tag_small;
Dict *tag_param;
Dict *tag_option;
Dict *tag_optgroup;
Dict *tag_img;
Dict *tag_map;
Dict *tag_area;
Dict *tag_nobr;
Dict *tag_wbr;
Dict *tag_font;
Dict *tag_layer;
Dict *tag_spacer;
Dict *tag_center;
Dict *tag_style;
Dict *tag_script;
Dict *tag_noscript;
Dict *tag_table;
Dict *tag_caption;
Dict *tag_form;
Dict *tag_textarea;
Dict *tag_blockquote;
Dict *tag_applet;
Dict *tag_object;
Dict *tag_div;
Dict *tag_span;
Dict *tag_input;
Dict *tag_q;

Dict *xml_tags;  /* dummy for xml tags */

/* the tag dictionary: HASHSIZE buckets of singly linked Dict chains */
static Dict *hashtab[HASHSIZE];

/* used by ResetDefinedTagSearch and FindNextDefinedTag */
static Dict *tag_blink;     /* a proprietary tag added by Tidy, along with tag_nobr, tag_wbr */
static Dict *curDictEntry;  /* next entry to examine, or null when a new bucket must be fetched */
static int curHashIndex;    /* index of the next bucket to fetch from hashtab */

/*
  Built-in tag definitions. InitTags() feeds each row to install().
  The table is terminated by an entry whose name is null.
*/
static struct tag
{
    char *name;
    unsigned versions;
    unsigned model;
    Parser *parser;
    CheckAttribs *chkattrs;
} tags[] =
{
    {"html",       VERS_ALL,                     (CM_HTML|CM_OPT|CM_OMITST),                  ParseHTML,     CheckHTML},

    {"head",       VERS_ALL,                     (CM_HTML|CM_OPT|CM_OMITST),                  ParseHead,     null},

    {"title",      VERS_ALL,                     CM_HEAD,                                     ParseTitle,    null},
    {"base",       VERS_ALL,                     (CM_HEAD|CM_EMPTY),                          ParseEmpty,    null},
    {"link",       VERS_ALL,                     (CM_HEAD|CM_EMPTY),                          ParseEmpty,    CheckLINK},
    {"meta",       VERS_ALL,                     (CM_HEAD|CM_EMPTY),                          ParseEmpty,    CheckMETA},
    {"style",      (VERS_FROM32)&~VERS_BASIC,    CM_HEAD,                                     ParseScript,   CheckSTYLE},
    {"script",     (VERS_FROM32)&~VERS_BASIC,    (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE),       ParseScript,   CheckSCRIPT},
    {"server",     VERS_NETSCAPE,                (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE),       ParseScript,   null},

    {"body",       VERS_ALL,                     (CM_HTML|CM_OPT|CM_OMITST),                  ParseBody,     null},
    {"frameset",   VERS_FRAMESET,                (CM_HTML|CM_FRAMES),                         ParseFrameSet, null},

    {"p",          VERS_ALL,                     (CM_BLOCK|CM_OPT),                           ParseInline,   null},
    {"h1",         VERS_ALL,                     (CM_BLOCK|CM_HEADING),                       ParseInline,   null},
    {"h2",         VERS_ALL,                     (CM_BLOCK|CM_HEADING),                       ParseInline,   null},
    {"h3",         VERS_ALL,                     (CM_BLOCK|CM_HEADING),                       ParseInline,   null},
    {"h4",         VERS_ALL,                     (CM_BLOCK|CM_HEADING),                       ParseInline,   null},
    {"h5",         VERS_ALL,                     (CM_BLOCK|CM_HEADING),                       ParseInline,   null},
    {"h6",         VERS_ALL,                     (CM_BLOCK|CM_HEADING),                       ParseInline,   null},
    {"ul",         VERS_ALL,                     CM_BLOCK,                                    ParseList,     null},
    {"ol",         VERS_ALL,                     CM_BLOCK,                                    ParseList,     null},
    {"dl",         VERS_ALL,                     CM_BLOCK,                                    ParseDefList,  null},
    {"dir",        VERS_LOOSE,                   (CM_BLOCK|CM_OBSOLETE),                      ParseList,     null},
    {"menu",       VERS_LOOSE,                   (CM_BLOCK|CM_OBSOLETE),                      ParseList,     null},
    {"pre",        VERS_ALL,                     CM_BLOCK,                                    ParsePre,      null},
    {"listing",    VERS_ALL,                     (CM_BLOCK|CM_OBSOLETE),                      ParsePre,      null},
    {"xmp",        VERS_ALL,                     (CM_BLOCK|CM_OBSOLETE),                      ParsePre,      null},
    {"plaintext",  VERS_ALL,                     (CM_BLOCK|CM_OBSOLETE),                      ParsePre,      null},
    {"address",    VERS_ALL,                     CM_BLOCK,                                    ParseBlock,    null},
    {"blockquote", VERS_ALL,                     CM_BLOCK,                                    ParseBlock,    null},
    {"form",       VERS_ALL,                     CM_BLOCK,                                    ParseBlock,    CheckFORM},
    {"isindex",    VERS_LOOSE,                   (CM_BLOCK|CM_EMPTY),                         ParseEmpty,    null},
    {"fieldset",   (VERS_HTML40)&~VERS_BASIC,    CM_BLOCK,                                    ParseBlock,    null},
    {"table",      VERS_FROM32,                  CM_BLOCK,                                    ParseTableTag, CheckTABLE},
    {"hr",         (VERS_ALL)&~VERS_BASIC,       (CM_BLOCK|CM_EMPTY),                         ParseEmpty,    CheckHR},
    {"div",        VERS_FROM32,                  CM_BLOCK,                                    ParseBlock,    null},
    {"multicol",   VERS_NETSCAPE,                CM_BLOCK,                                    ParseBlock,    null},
    {"nosave",     VERS_NETSCAPE,                CM_BLOCK,                                    ParseBlock,    null},
    {"layer",      VERS_NETSCAPE,                CM_BLOCK,                                    ParseBlock,    null},
    {"ilayer",     VERS_NETSCAPE,                CM_INLINE,                                   ParseInline,   null},
    {"nolayer",    VERS_NETSCAPE,                (CM_BLOCK|CM_INLINE|CM_MIXED),               ParseBlock,    null},
    {"align",      VERS_NETSCAPE,                CM_BLOCK,                                    ParseBlock,    null},
    {"center",     VERS_LOOSE,                   CM_BLOCK,                                    ParseBlock,    null},
    {"ins",        (VERS_HTML40)&~VERS_BASIC,    (CM_INLINE|CM_BLOCK|CM_MIXED),               ParseInline,   null},
    {"del",        (VERS_HTML40)&~VERS_BASIC,    (CM_INLINE|CM_BLOCK|CM_MIXED),               ParseInline,   null},

    {"li",         VERS_ALL,                     (CM_LIST|CM_OPT|CM_NO_INDENT),               ParseBlock,    null},
    {"dt",         VERS_ALL,                     (CM_DEFLIST|CM_OPT|CM_NO_INDENT),            ParseInline,   null},
    {"dd",         VERS_ALL,                     (CM_DEFLIST|CM_OPT|CM_NO_INDENT),            ParseBlock,    null},

    {"caption",    VERS_FROM32,                  CM_TABLE,                                    ParseInline,   CheckCaption},
    {"colgroup",   VERS_HTML40,                  (CM_TABLE|CM_OPT),                           ParseColGroup, null},
    {"col",        VERS_HTML40,                  (CM_TABLE|CM_EMPTY),                         ParseEmpty,    null},
    {"thead",      (VERS_HTML40)&~VERS_BASIC,    (CM_TABLE|CM_ROWGRP|CM_OPT),                 ParseRowGroup, null},
    {"tfoot",      (VERS_HTML40)&~VERS_BASIC,    (CM_TABLE|CM_ROWGRP|CM_OPT),                 ParseRowGroup, null},
    {"tbody",      (VERS_HTML40)&~VERS_BASIC,    (CM_TABLE|CM_ROWGRP|CM_OPT),                 ParseRowGroup, null},
    {"tr",         VERS_FROM32,                  (CM_TABLE|CM_OPT),                           ParseRow,      null},
    {"td",         VERS_FROM32,                  (CM_ROW|CM_OPT|CM_NO_INDENT),                ParseBlock,    CheckTableCell},
    {"th",         VERS_FROM32,                  (CM_ROW|CM_OPT|CM_NO_INDENT),                ParseBlock,    CheckTableCell},

    {"q",          VERS_HTML40,                  CM_INLINE,                                   ParseInline,   null},
    {"a",          VERS_ALL,                     CM_INLINE,                                   ParseInline,   CheckAnchor},
    {"br",         VERS_ALL,                     (CM_INLINE|CM_EMPTY),                        ParseEmpty,    null},
    {"img",        VERS_ALL,                     (CM_INLINE|CM_IMG|CM_EMPTY),                 ParseEmpty,    CheckIMG},
    {"object",     VERS_HTML40,                  (CM_OBJECT|CM_HEAD|CM_IMG|CM_INLINE|CM_PARAM), ParseBlock,  null},
    {"applet",     VERS_LOOSE,                   (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM),       ParseBlock,    null},
    {"servlet",    VERS_SUN,                     (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM),       ParseBlock,    null},
    {"param",      VERS_FROM32,                  (CM_INLINE|CM_EMPTY),                        ParseEmpty,    null},
    {"embed",      VERS_NETSCAPE,                (CM_INLINE|CM_IMG|CM_EMPTY),                 ParseEmpty,    null},
    {"noembed",    VERS_NETSCAPE,                CM_INLINE,                                   ParseInline,   null},
    {"iframe",     VERS_IFRAME,                  CM_INLINE,                                   ParseBlock,    null},
    {"frame",      VERS_FRAMESET,                (CM_FRAMES|CM_EMPTY),                        ParseEmpty,    null},
    {"noframes",   VERS_IFRAME,                  (CM_BLOCK|CM_FRAMES),                        ParseNoFrames, null},
    {"noscript",   (VERS_HTML40)&~VERS_BASIC,    (CM_BLOCK|CM_INLINE|CM_MIXED),               ParseBlock,    null},
    {"b",          (VERS_ALL)&~VERS_BASIC,       CM_INLINE,                                   ParseInline,   null},
    {"i",          (VERS_ALL)&~VERS_BASIC,       CM_INLINE,                                   ParseInline,   null},
    {"u",          VERS_LOOSE,                   CM_INLINE,                                   ParseInline,   null},
    {"tt",         (VERS_ALL)&~VERS_BASIC,       CM_INLINE,                                   ParseInline,   null},
    {"s",          VERS_LOOSE,                   CM_INLINE,                                   ParseInline,   null},
    {"strike",     VERS_LOOSE,                   CM_INLINE,                                   ParseInline,   null},
    {"big",        (VERS_FROM32)&~VERS_BASIC,    CM_INLINE,                                   ParseInline,   null},
    {"small",      (VERS_FROM32)&~VERS_BASIC,    CM_INLINE,                                   ParseInline,   null},
    {"sub",        (VERS_FROM32)&~VERS_BASIC,    CM_INLINE,                                   ParseInline,   null},
    {"sup",        (VERS_FROM32)&~VERS_BASIC,    CM_INLINE,                                   ParseInline,   null},
    {"em",         VERS_ALL,                     CM_INLINE,                                   ParseInline,   null},
    {"strong",     VERS_ALL,                     CM_INLINE,                                   ParseInline,   null},
    {"dfn",        VERS_ALL,                     CM_INLINE,                                   ParseInline,   null},
    {"code",       VERS_ALL,                     CM_INLINE,                                   ParseInline,   null},
    {"samp",       VERS_ALL,                     CM_INLINE,                                   ParseInline,   null},
    {"kbd",        VERS_ALL,                     CM_INLINE,                                   ParseInline,   null},
    {"var",        VERS_ALL,                     CM_INLINE,                                   ParseInline,   null},
    {"cite",       VERS_ALL,                     CM_INLINE,                                   ParseInline,   null},
    {"abbr",       VERS_HTML40,                  CM_INLINE,                                   ParseInline,   null},
    {"acronym",    VERS_HTML40,                  CM_INLINE,                                   ParseInline,   null},
    {"span",       VERS_FROM32,                  CM_INLINE,                                   ParseInline,   null},
    {"blink",      VERS_PROPRIETARY,             CM_INLINE,                                   ParseInline,   null},
    {"nobr",       VERS_PROPRIETARY,             CM_INLINE,                                   ParseInline,   null},
    {"wbr",        VERS_PROPRIETARY,             (CM_INLINE|CM_EMPTY),                        ParseEmpty,    null},
    {"marquee",    VERS_MICROSOFT,               (CM_INLINE|CM_OPT),                          ParseInline,   null},
    {"bgsound",    VERS_MICROSOFT,               (CM_HEAD|CM_EMPTY),                          ParseEmpty,    null},
    {"comment",    VERS_MICROSOFT,               CM_INLINE,                                   ParseInline,   null},
    {"spacer",     VERS_NETSCAPE,                (CM_INLINE|CM_EMPTY),                        ParseEmpty,    null},
    {"keygen",     VERS_NETSCAPE,                (CM_INLINE|CM_EMPTY),                        ParseEmpty,    null},

    /* next 2 are already defined above - does no harm though */
    {"nolayer",    VERS_NETSCAPE,                (CM_BLOCK|CM_INLINE|CM_MIXED),               ParseBlock,    null},
    {"ilayer",     VERS_NETSCAPE,                CM_INLINE,                                   ParseInline,   null},

    {"map",        (VERS_FROM32)&~VERS_BASIC,    CM_INLINE,                                   ParseBlock,    CheckMap},
    {"area",       (VERS_ALL)&~VERS_BASIC,       (CM_BLOCK|CM_EMPTY),                         ParseEmpty,    CheckAREA},
    {"input",      VERS_ALL,                     (CM_INLINE|CM_IMG|CM_EMPTY),                 ParseEmpty,    null},
    {"select",     VERS_ALL,                     (CM_INLINE|CM_FIELD),                        ParseSelect,   null},
    {"option",     VERS_ALL,                     (CM_FIELD|CM_OPT),                           ParseText,     null},
    {"optgroup",   (VERS_HTML40)&~VERS_BASIC,    (CM_FIELD|CM_OPT),                           ParseOptGroup, null},
    {"textarea",   VERS_ALL,                     (CM_INLINE|CM_FIELD),                        ParseText,     null},
    {"label",      VERS_HTML40,                  CM_INLINE,                                   ParseInline,   null},
    {"legend",     (VERS_HTML40)&~VERS_BASIC,    CM_INLINE,                                   ParseInline,   null},
    {"button",     (VERS_HTML40)&~VERS_BASIC,    CM_INLINE,                                   ParseInline,   null},
    {"basefont",   VERS_LOOSE,                   (CM_INLINE|CM_EMPTY),                        ParseEmpty,    null},
    {"font",       VERS_LOOSE,                   CM_INLINE,                                   ParseInline,   null},
    {"bdo",        (VERS_HTML40)&~VERS_BASIC,    CM_INLINE,                                   ParseInline,   null},

    /* elements for XHTML 1.1 */
    {"ruby",       VERS_XHTML11,                 CM_INLINE,                                   ParseInline,   null},
    {"rbc",        VERS_XHTML11,                 CM_INLINE,                                   ParseInline,   null},
    {"rtc",        VERS_XHTML11,                 CM_INLINE,                                   ParseInline,   null},
    {"rb",         VERS_XHTML11,                 CM_INLINE,                                   ParseInline,   null},
    {"rt",         VERS_XHTML11,                 CM_INLINE,                                   ParseInline,   null},
    {"rp",         VERS_XHTML11,                 CM_INLINE,                                   ParseInline,   null},

    /* this must be the final entry */
    {null, 0, 0, 0, 0}
};

/*
  choose what version to use for new doctype

  Tests lexer->versions against the candidate versions in a fixed
  order and returns the first one whose bit is set. HTML 3.2 is only
  considered when none of XmlOut, XmlTags and lexer->isvoyager is set.
  Returns VERS_UNKNOWN when no candidate matches.
*/
int HTMLVersion(Lexer *lexer)
{
    uint versions;

    versions = lexer->versions;

    if (versions & VERS_HTML20)
        return VERS_HTML20;

    if (!(XmlOut|XmlTags|lexer->isvoyager) && (versions & VERS_HTML32))
        return VERS_HTML32;

    if (versions & VERS_XHTML11)
        return VERS_XHTML11;

    if (versions & VERS_HTML40_STRICT)
        return VERS_HTML40_STRICT;

    if (versions & VERS_HTML40_LOOSE)
        return VERS_HTML40_LOOSE;

    if (versions & VERS_FRAMESET)
        return VERS_FRAMESET;

    return VERS_UNKNOWN;
}

/*
  Hash a NUL-terminated name into a bucket index in [0, HASHSIZE).
  Each character is folded in as  hashval = c + 31*hashval.
*/
static unsigned hash(char *s)
{
    unsigned hashval;

    for (hashval = 0; *s != '\0'; s++)
        hashval = *s + 31*hashval;

    return hashval % HASHSIZE;
}

/*
  Walk the bucket chain selected by hash(s) and return the entry
  whose name compares equal to s under wstrcmp(), or null if the
  name is not in the dictionary.
*/
static Dict *lookup(char *s)
{
    Dict *np;

    for (np = hashtab[hash(s)]; np != null; np = np->next)
        if (wstrcmp(s, np->name) == 0)
            return np;

    return null;
}

/*
  Add a tag to the dictionary, or update it if the name is already
  present.

  For a new name a Dict node is allocated, given a private copy of
  the name and pushed on the front of its bucket chain with an empty
  content model. In both cases versions, parser and chkattrs are then
  overwritten, while the model bits are OR-ed into whatever model the
  entry already carries.

  Returns the entry, or null if the node or the name copy could not
  be allocated.
*/
static Dict *install(char *name, uint versions, uint model,
                     Parser *parser, CheckAttribs *chkattrs)
{
    Dict *np;
    unsigned hashval;

    if ((np = lookup(name)) == null)
    {
        np = (Dict *)MemAlloc(sizeof(*np));

        if (np == null)
            return null;

        np->name = wstrdup(name);

        if (np->name == null)
        {
            /* don't leak the half-built node when the name copy fails */
            MemFree(np);
            return null;
        }

        hashval = hash(name);
        np->next = hashtab[hashval];
        np->model = 0;
        hashtab[hashval] = np;
    }

    np->versions = versions;
    np->model |= model;
    np->parser = parser;
    np->chkattrs = chkattrs;
    return np;
}

/*
  public interface for finding tag by name

  When XmlTags is set every node receives the shared xml_tags dummy
  entry and the call succeeds. Otherwise node->element is looked up
  in the dictionary; node->tag is set only on success.
  Returns yes if node->tag was assigned, no otherwise.
*/
Bool FindTag(Node *node)
{
    Dict *np;

    if (XmlTags)
    {
        node->tag = xml_tags;
        return yes;
    }

    if (node->element && (np = lookup(node->element)))
    {
        node->tag = np;
        return yes;
    }

    return no;
}

/*
  Return the parser registered for node->element, or null when the
  node has no element name or the name is not in the dictionary.
*/
Parser *FindParser(Node *node)
{
    Dict *np;

    if (node->element && (np = lookup(node->element)))
        return np->parser;

    return null;
}

/*
  Register a user defined tag as VERS_PROPRIETARY with CM_NEW and
  CM_NO_INDENT set. tagType selects the remaining model bit and the
  parser; any other tagType value is ignored.
*/
void DefineTag(int tagType, char *name)
{
    switch (tagType)
    {
        case tagtype_empty :
            install(name, VERS_PROPRIETARY, (CM_EMPTY|CM_NO_INDENT|CM_NEW), ParseBlock, null);
            break;

        case tagtype_inline :
            install(name, VERS_PROPRIETARY, (CM_INLINE|CM_NO_INDENT|CM_NEW), ParseInline, null);
            break;

        case tagtype_block :
            install(name, VERS_PROPRIETARY, (CM_BLOCK|CM_NO_INDENT|CM_NEW), ParseBlock, null);
            break;

        case tagtype_pre :
            install(name, VERS_PROPRIETARY, (CM_BLOCK|CM_NO_INDENT|CM_NEW), ParsePre, null);
            break;

        default :
            break;
    }
}

/*
  Rewind the cursor used by FindNextDefinedTag() to the start of the
  dictionary.
*/
void ResetDefinedTagSearch(void)
{
    curDictEntry = null;
    curHashIndex = 0;
}

/*
  Return the name of the next dictionary entry that matches tagType,
  continuing from where the previous call stopped, or null once the
  whole dictionary has been scanned. Call ResetDefinedTagSearch()
  to start a new scan.

  An entry matches when its versions field is exactly
  VERS_PROPRIETARY and it carries the model bit for tagType; block
  and pre types are further told apart by their parser. The built-in
  proprietary tags blink, nobr and wbr are skipped.

  The returned pointer is the entry's own name string, not a copy.
*/
char *FindNextDefinedTag(int tagType)
{
    char *tagName = null;

    do
    {
        if (curDictEntry != null)
        {
            switch (tagType)
            {
                /* defined tags can be empty + inline */
                case tagtype_empty :
                    if ((curDictEntry->versions == VERS_PROPRIETARY) &&
                        ((curDictEntry->model & CM_EMPTY) == CM_EMPTY) &&
                        /* (curDictEntry->parser == ParseBlock) && */
                        (curDictEntry != tag_wbr))
                        tagName = curDictEntry->name;
                    break;

                /* defined tags can be empty + inline */
                case tagtype_inline :
                    if ((curDictEntry->versions == VERS_PROPRIETARY) &&
                        ((curDictEntry->model & CM_INLINE) == CM_INLINE) &&
                        /* (curDictEntry->parser == ParseInline) && */
                        (curDictEntry != tag_blink) &&
                        (curDictEntry != tag_nobr) &&
                        (curDictEntry != tag_wbr))
                        tagName = curDictEntry->name;
                    break;

                /* defined tags can be empty + block */
                case tagtype_block :
                    if ((curDictEntry->versions == VERS_PROPRIETARY) &&
                        ((curDictEntry->model & CM_BLOCK) == CM_BLOCK) &&
                        (curDictEntry->parser == ParseBlock))
                        tagName = curDictEntry->name;
                    break;

                case tagtype_pre :
                    if ((curDictEntry->versions == VERS_PROPRIETARY) &&
                        ((curDictEntry->model & CM_BLOCK) == CM_BLOCK) &&
                        (curDictEntry->parser == ParsePre))
                        tagName = curDictEntry->name;
                    break;
            }

            curDictEntry = curDictEntry->next;
        }

        /*
          Chain exhausted: move on to the next non-empty bucket.
          The bound is tested BEFORE the read so that hashtab is never
          indexed at HASHSIZE, which would otherwise happen when the
          last bucket is non-empty or when the function is called
          again after the scan has finished.
        */
        while ((curDictEntry == null) && (curHashIndex < HASHSIZE))
            curDictEntry = hashtab[curHashIndex++];
    }
    while ((tagName == null) && (curDictEntry != null));

    return tagName;
}

/*
  Populate the dictionary from the built-in tags[] table, cache the
  entries of frequently tested tags in the tag_* globals and create
  the dummy entry handed out for every element in XML mode.
*/
void InitTags(void)
{
    struct tag *tp;

    for (tp = tags; tp->name != null; ++tp)
        install(tp->name, tp->versions, tp->model, tp->parser, tp->chkattrs);

    tag_html = lookup("html");
    tag_head = lookup("head");
    tag_body = lookup("body");
    tag_frameset = lookup("frameset");
    tag_frame = lookup("frame");
    tag_iframe = lookup("iframe");      /* #433359 - fix by Randy Waki 12 Mar 01 */
    tag_noframes = lookup("noframes");
    tag_meta = lookup("meta");
    tag_title = lookup("title");
    tag_base = lookup("base");
    tag_hr = lookup("hr");
    tag_pre = lookup("pre");
    tag_listing = lookup("listing");
    tag_h1 = lookup("h1");
    tag_h2 = lookup("h2");
    tag_p  = lookup("p");
    tag_ul = lookup("ul");
    tag_ol = lookup("ol");
    tag_dir = lookup("dir");
    tag_li = lookup("li");
    tag_dl = lookup("dl");
    tag_dt = lookup("dt");
    tag_dd = lookup("dd");
    tag_td = lookup("td");
    tag_th = lookup("th");
    tag_tr = lookup("tr");
    tag_col = lookup("col");
    tag_br = lookup("br");
    tag_a = lookup("a");
    tag_link = lookup("link");
    tag_b = lookup("b");
    tag_i = lookup("i");
    tag_strong = lookup("strong");
    tag_em = lookup("em");
    tag_big = lookup("big");
    tag_small = lookup("small");
    tag_param = lookup("param");
    tag_option = lookup("option");
    tag_optgroup = lookup("optgroup");
    tag_img = lookup("img");
    tag_map = lookup("map");
    tag_area = lookup("area");
    tag_nobr = lookup("nobr");
    tag_wbr = lookup("wbr");
    tag_font = lookup("font");
    tag_spacer = lookup("spacer");
    tag_layer = lookup("layer");
    tag_center = lookup("center");
    tag_style = lookup("style");
    tag_script = lookup("script");
    tag_noscript = lookup("noscript");
    tag_table = lookup("table");
    tag_caption = lookup("caption");
    tag_form = lookup("form");
    tag_textarea = lookup("textarea");
    tag_blockquote = lookup("blockquote");
    tag_applet = lookup("applet");
    tag_object = lookup("object");
    tag_div = lookup("div");
    tag_span = lookup("span");
    tag_input = lookup("input");
    tag_q = lookup("q");

    /*
      create dummy entry for all xml tags
      (reuse the existing one if InitTags is called again, so that
      the previous allocation is not leaked)
    */
    if (xml_tags == null)
        xml_tags = (Dict *)MemAlloc(sizeof(*xml_tags));

    if (xml_tags != null)
    {
        xml_tags->next = null;  /* the dummy is not part of any bucket chain */
        xml_tags->name = null;
        xml_tags->versions = VERS_ALL;
        xml_tags->model = CM_BLOCK;
        xml_tags->parser = null;
        xml_tags->chkattrs = null;
    }

    tag_blink = lookup("blink");  /* so we can skip this in the search for user defined tags */
}

/*
  Release the XML dummy entry and every dictionary entry together
  with its name, leaving all buckets empty.

  The search cursor is rewound and xml_tags is cleared so that
  neither keeps pointing at freed memory, which also makes a second
  call harmless. The tag_* globals are NOT cleared; they are stale
  until InitTags() is called again.
*/
void FreeTags(void)
{
    Dict *entry, *following;
    int i;

    if (xml_tags != null)
    {
        MemFree(xml_tags);
        xml_tags = null;
    }

    for (i = 0; i < HASHSIZE; ++i)
    {
        entry = hashtab[i];

        while (entry)
        {
            /* remember the successor before the node is released */
            following = entry->next;
            MemFree(entry->name);
            MemFree(entry);
            entry = following;
        }

        hashtab[i] = null;
    }

    /* the cursor may have pointed into one of the chains just freed */
    ResetDefinedTagSearch();
}