2 This file is part of the KDE libraries
4 Copyright (C) 1997 Martin Jones (mjones@kde.org)
5 (C) 1997 Torben Weis (weis@kde.org)
6 (C) 1999,2001 Lars Knoll (knoll@kde.org)
7 (C) 2000,2001 Dirk Mueller (mueller@kde.org)
8 Copyright (C) 2004, 2005, 2006 Apple Computer, Inc.
10 This library is free software; you can redistribute it and/or
11 modify it under the terms of the GNU Library General Public
12 License as published by the Free Software Foundation; either
13 version 2 of the License, or (at your option) any later version.
15 This library is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Library General Public License for more details.
20 You should have received a copy of the GNU Library General Public License
21 along with this library; see the file COPYING.LIB. If not, write to
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 Boston, MA 02111-1307, USA.
27 #include "HTMLParser.h"
29 #include "CSSPropertyNames.h"
30 #include "CSSValueKeywords.h"
32 #include "DocumentFragment.h"
34 #include "HTMLBodyElement.h"
35 #include "HTMLCanvasElement.h"
36 #include "HTMLDivElement.h"
37 #include "HTMLDListElement.h"
38 #include "HTMLElementFactory.h"
39 #include "HTMLFormElement.h"
40 #include "HTMLHeadElement.h"
41 #include "HTMLHRElement.h"
42 #include "HTMLHtmlElement.h"
43 #include "HTMLIsIndexElement.h"
44 #include "HTMLMapElement.h"
45 #include "HTMLNames.h"
46 #include "HTMLTableCellElement.h"
47 #include "HTMLTableRowElement.h"
48 #include "HTMLTableSectionElement.h"
49 #include "HTMLTokenizer.h"
50 #include "LocalizedStrings.h"
55 using namespace HTMLNames;
57 const UChar nonBreakingSpace = 0xa0;
63 class RefNonDocNodePtr
66 RefNonDocNodePtr() : m_ptr(0), m_isDoc(false) {} // Empty pointer; m_isDoc is initialised so the destructor and copies never read an indeterminate flag.
67 RefNonDocNodePtr(Node* ptr) : m_ptr(ptr), m_isDoc(ptr && ptr->isDocumentNode()) { if (!m_isDoc && ptr) ptr->ref(); } // Null-safe: only a non-null, non-document node is ref'ed.
68 RefNonDocNodePtr(const RefNonDocNodePtr& o) : m_ptr(o.m_ptr), m_isDoc(o.m_isDoc) { if (!m_isDoc && m_ptr) m_ptr->ref(); }
70 ~RefNonDocNodePtr() { if (!m_isDoc && m_ptr) m_ptr->deref(); }
72 Node *get() const { return m_ptr; }
74 Node& operator*() const { return *m_ptr; }
75 Node *operator->() const { return m_ptr; }
77 bool operator!() const { return !m_ptr; }
79 // This conversion operator allows implicit conversion to bool but not to other integer types.
80 typedef Node* (RefNonDocNodePtr::*UnspecifiedBoolType)() const;
81 operator UnspecifiedBoolType() const { return m_ptr ? &RefNonDocNodePtr::get : 0; }
83 RefNonDocNodePtr& operator=(const RefNonDocNodePtr&);
84 RefNonDocNodePtr& operator=(Node*);
85 RefNonDocNodePtr& operator=(RefPtr<Node>&);
92 inline RefNonDocNodePtr& RefNonDocNodePtr::operator=(const RefNonDocNodePtr& o)
95 if (!o.m_isDoc && optr)
105 inline RefNonDocNodePtr& RefNonDocNodePtr::operator=(Node* optr)
107 bool o_isDoc = optr && optr->isDocumentNode(); // Null-safe: a null optr is treated as a non-document node.
108 if (!o_isDoc && optr)
118 inline RefNonDocNodePtr& RefNonDocNodePtr::operator=(RefPtr<Node>& o)
120 Node* optr = o.get();
121 bool o_isDoc = optr && optr->isDocumentNode(); // Null-safe: o may hold a null pointer.
122 if (!o_isDoc && optr)
132 inline bool operator==(const RefNonDocNodePtr& a, const RefNonDocNodePtr& b)
134 return a.get() == b.get();
137 inline bool operator==(const RefNonDocNodePtr& a, Node* b)
142 inline bool operator==(Node* a, const RefNonDocNodePtr& b)
147 inline bool operator!=(const RefNonDocNodePtr& a, const RefNonDocNodePtr& b)
149 return a.get() != b.get();
152 inline bool operator!=(const RefNonDocNodePtr& a, Node* b)
157 inline bool operator!=(Node* a, const RefNonDocNodePtr& b)
167 HTMLStackElem(const AtomicString& _tagName,
170 HTMLStackElem * _next
175 strayTableContent(false),
180 AtomicString tagName;
182 bool strayTableContent;
183 RefNonDocNodePtr node;
190 * The parser parses tokenized input into the document, building up the
191 * document tree. If the document is well-formed, parsing it is straightforward.
193 * Unfortunately, people can't write well-formed HTML documents, so the parser
194 * has to be tolerant about errors.
196 * We have to take care of the following error conditions:
197 * 1. The element being added is explicitly forbidden inside some outer tag.
198 * In this case we should close all tags up to the one that forbids
199 * the element, and add it afterwards.
200 * 2. We are not allowed to add the element directly. It could be that
201 * the person writing the document forgot some tag in between (or that the
202 * tag in between is optional...). This could be the case with the following
203 * tags: HTML HEAD BODY TBODY TR TD LI (did I forget any?)
204 * 3. We want to add a block element inside an inline element. Close all
205 * inline elements up to the next higher block element.
206 * 4. If this doesn't help, close elements until we are allowed to add the
207 * element or ignore the tag.
210 HTMLParser::HTMLParser(Document* doc)
213 , currentIsReferenced(false)
220 HTMLParser::HTMLParser(DocumentFragment* frag)
221 : document(frag->document())
223 , currentIsReferenced(false)
232 HTMLParser::~HTMLParser()
239 void HTMLParser::reset()
246 haveFrameSet = false;
249 inStrayTableContent = 0;
257 discard_until = nullAtom;
260 void HTMLParser::setCurrent(Node *newCurrent)
262 bool newCurrentIsReferenced = newCurrent && newCurrent != doc();
263 if (newCurrentIsReferenced)
265 if (currentIsReferenced)
267 current = newCurrent;
268 currentIsReferenced = newCurrentIsReferenced;
271 PassRefPtr<Node> HTMLParser::parseToken(Token *t)
273 if (!discard_until.isNull()) {
274 if (t->tagName == discard_until && !t->beginTag)
275 discard_until = nullAtom;
277 // do not skip </iframe>
278 if (!discard_until.isNull() || (current->localName() != t->tagName))
282 // Apparently some sites use </br> instead of <br>. Be compatible with IE and Firefox and treat this like <br>.
283 if (t->isCloseTag(brTag) && doc()->inCompatMode())
291 // ignore spaces, if we're not inside a paragraph or other inline code
292 if (t->tagName == textAtom && t->text) {
293 if (inBody && !skipMode() && current->localName() != styleTag && current->localName() != titleTag &&
294 current->localName() != scriptTag && !t->text->containsOnlyWhitespace())
298 RefPtr<Node> n = getNode(t);
299 // just to be sure, and to catch currently unimplemented stuff
304 if (n->isHTMLElement()) {
305 HTMLElement* e = static_cast<HTMLElement*>(n.get());
306 e->setAttributeMap(t->attrs.get());
308 // take care of optional close tags
309 if (e->endTagRequirement() == TagStatusOptional)
310 popBlock(t->tagName);
312 if (isHeaderTag(t->tagName))
313 // Do not allow two header tags to be nested if the intervening tags are inlines.
314 popNestedHeaderTag();
317 if (!insertNode(n.get(), t->flat)) {
318 // we couldn't insert the node
320 if (n->isElementNode()) {
321 Element* e = static_cast<Element*>(n.get());
322 e->setAttributeMap(0);
339 static bool isTableSection(Node* n)
341 return n->hasTagName(tbodyTag) || n->hasTagName(tfootTag) || n->hasTagName(theadTag);
344 static bool isTablePart(Node* n)
346 return n->hasTagName(trTag) || n->hasTagName(tdTag) || n->hasTagName(thTag) ||
350 static bool isTableRelated(Node* n)
352 return n->hasTagName(tableTag) || isTablePart(n);
355 bool HTMLParser::insertNode(Node *n, bool flat)
357 RefPtr<Node> protectNode(n);
359 const AtomicString& localName = n->localName();
360 int tagPriority = n->isHTMLElement() ? static_cast<HTMLElement*>(n)->tagPriority() : 0;
362 // let's be stupid and just try to insert it.
363 // this should work if the document is well-formed
364 Node *newNode = current->addChild(n);
366 // don't push elements without end tags (e.g., <img>) on the stack
367 bool parentAttached = current->attached();
368 if (tagPriority > 0 && !flat) {
369 pushBlock(localName, tagPriority);
370 if (newNode == current)
374 if (parentAttached && !n->attached() && !m_fragment)
377 if (parentAttached && !n->attached() && !m_fragment)
384 return handleError(n, flat, localName, tagPriority); // Try to handle the error.
387 bool HTMLParser::handleError(Node* n, bool flat, const AtomicString& localName, int tagPriority)
389 // Error handling code. This is just ad hoc handling of specific parent/child combinations.
391 bool handled = false;
393 // 1. Check out the element's tag name to decide how to deal with errors.
394 if (n->isTextNode()) {
395 if (current->hasTagName(selectTag))
397 } else if (n->isHTMLElement()) {
398 HTMLElement* h = static_cast<HTMLElement*>(n);
399 if (h->hasLocalName(trTag) || h->hasLocalName(thTag) || h->hasLocalName(tdTag)) {
400 if (inStrayTableContent && !isTableRelated(current)) {
401 // pop out to the nearest enclosing table-related tag.
402 while (blockStack && !isTableRelated(current))
404 return insertNode(n);
406 } else if (h->hasLocalName(headTag)) {
407 if (!current->isDocumentNode() && !current->hasTagName(htmlTag))
409 } else if (h->hasLocalName(metaTag) || h->hasLocalName(linkTag) || h->hasLocalName(baseTag)) {
413 if (head->addChild(n)) {
414 if (!n->attached() && !m_fragment)
420 } else if (h->hasLocalName(htmlTag)) {
421 if (!current->isDocumentNode() ) {
422 if (doc()->firstChild()->hasTagName(htmlTag)) {
423 // we have another <HTML> element.... apply attributes to existing one
424 // make sure we don't overwrite already existing attributes
425 NamedAttrMap *map = static_cast<Element*>(n)->attributes(true);
426 Element *existingHTML = static_cast<Element*>(doc()->firstChild());
427 NamedAttrMap *bmap = existingHTML->attributes(false);
428 for (unsigned l = 0; map && l < map->length(); ++l) {
429 Attribute* it = map->attributeItem(l);
430 if (!bmap->getAttributeItem(it->name()))
431 existingHTML->setAttribute(it->name(), it->value());
436 } else if (h->hasLocalName(titleTag) || h->hasLocalName(styleTag)) {
440 Node* newNode = head->addChild(n);
442 setSkipMode(h->tagQName());
445 pushBlock(localName, tagPriority);
447 if (!n->attached() && !m_fragment)
452 setSkipMode(h->tagQName());
455 } else if (h->hasLocalName(bodyTag)) {
456 if (inBody && doc()->body()) {
457 // we have another <BODY> element.... apply attributes to existing one
458 // make sure we don't overwrite already existing attributes
459 // some sites use <body bgcolor=rightcolor>...<body bgcolor=wrongcolor>
460 NamedAttrMap *map = static_cast<Element*>(n)->attributes(true);
461 Element *existingBody = doc()->body();
462 NamedAttrMap *bmap = existingBody->attributes(false);
463 for (unsigned l = 0; map && l < map->length(); ++l) {
464 Attribute* it = map->attributeItem(l);
465 if (!bmap->getAttributeItem(it->name()))
466 existingBody->setAttribute(it->name(), it->value());
470 else if (!current->isDocumentNode())
472 } else if (h->hasLocalName(inputTag)) {
473 if (equalIgnoringCase(h->getAttribute(typeAttr), "hidden") && form) {
475 if (!n->attached() && !m_fragment)
479 } else if (h->hasLocalName(ddTag) || h->hasLocalName(dtTag)) {
480 e = new HTMLDListElement(document);
485 } else if (h->hasLocalName(areaTag)) {
488 if (!n->attached() && !m_fragment)
494 } else if (h->hasLocalName(captionTag)) {
495 if (isTablePart(current)) {
496 Node* tsection = current;
497 if (current->hasTagName(trTag))
498 tsection = current->parent();
499 else if (current->hasTagName(tdTag) || current->hasTagName(thTag))
500 tsection = current->parent()->parent();
501 Node* table = tsection->parent();
502 ExceptionCode ec = 0;
503 table->insertBefore(n, tsection, ec);
504 pushBlock(localName, tagPriority);
506 inStrayTableContent++;
507 blockStack->strayTableContent = true;
510 } else if (h->hasLocalName(theadTag) || h->hasLocalName(tbodyTag) ||
511 h->hasLocalName(tfootTag) || h->hasLocalName(colgroupTag)) {
512 if (isTableRelated(current)) {
513 while (blockStack && isTablePart(current))
515 return insertNode(n);
520 // 2. Next we examine our currently active element to do some further error handling.
521 if (current->isHTMLElement()) {
522 HTMLElement* h = static_cast<HTMLElement*>(current);
523 const AtomicString& currentTagName = current->localName();
524 if (h->hasLocalName(htmlTag)) {
525 HTMLElement* elt = n->isHTMLElement() ? static_cast<HTMLElement*>(n) : 0;
526 if (elt && (elt->hasLocalName(scriptTag) || elt->hasLocalName(styleTag) ||
527 elt->hasLocalName(metaTag) || elt->hasLocalName(linkTag) ||
528 elt->hasLocalName(objectTag) || elt->hasLocalName(embedTag) ||
529 elt->hasLocalName(titleTag) || elt->hasLocalName(isindexTag) ||
530 elt->hasLocalName(baseTag))) {
532 head = new HTMLHeadElement(document);
538 if (n->isTextNode()) {
539 Text *t = static_cast<Text *>(n);
540 if (t->containsOnlyWhitespace())
544 e = new HTMLBodyElement(document);
550 } else if (h->hasLocalName(headTag)) {
551 if (n->hasTagName(htmlTag))
554 // This means the body starts here...
556 popBlock(currentTagName);
557 e = new HTMLBodyElement(document);
563 } else if (h->hasLocalName(addressTag) || h->hasLocalName(dlTag) || h->hasLocalName(dtTag)
564 || h->hasLocalName(fontTag) || h->hasLocalName(styleTag) || h->hasLocalName(titleTag)) {
565 popBlock(currentTagName);
567 } else if (h->hasLocalName(captionTag)) {
568 // Illegal content in a caption. Close the caption and try again.
569 popBlock(currentTagName);
571 return insertNode(n, flat);
572 } else if (h->hasLocalName(tableTag) || h->hasLocalName(trTag) || isTableSection(h)) {
573 if (n->hasTagName(tableTag)) {
574 popBlock(localName); // end the table
575 handled = true; // ...and start a new one
577 bool possiblyMoveStrayContent = true;
578 ExceptionCode ec = 0;
579 if (n->isTextNode()) {
580 Text *t = static_cast<Text *>(n);
581 if (t->containsOnlyWhitespace())
583 StringImpl *i = t->string();
584 unsigned int pos = 0;
585 while (pos < i->length() && ((*i)[pos] == ' ' || (*i)[pos] == nonBreakingSpace))
587 if (pos == i->length())
588 possiblyMoveStrayContent = false;
590 if (possiblyMoveStrayContent) {
591 Node *node = current;
592 Node *parent = node->parentNode();
593 // A script may have removed the current node's parent from the DOM
594 // http://bugzilla.opendarwin.org/show_bug.cgi?id=7137
595 // FIXME: we should do real recovery here and re-parent with the correct node.
598 Node *grandparent = parent->parentNode();
600 if (n->isTextNode() ||
601 (h->hasLocalName(trTag) &&
602 isTableSection(parent) && grandparent->hasTagName(tableTag)) ||
603 ((!n->hasTagName(tdTag) && !n->hasTagName(thTag) &&
604 !n->hasTagName(formTag) && !n->hasTagName(scriptTag)) && isTableSection(node) &&
605 parent->hasTagName(tableTag))) {
606 node = (node->hasTagName(tableTag)) ? node :
607 ((node->hasTagName(trTag)) ? grandparent : parent);
608 Node *parent = node->parentNode();
611 parent->insertBefore(n, node, ec);
613 if (n->isHTMLElement() && tagPriority > 0 &&
614 !flat && static_cast<HTMLElement*>(n)->endTagRequirement() != TagStatusForbidden)
616 pushBlock(localName, tagPriority);
618 inStrayTableContent++;
619 blockStack->strayTableContent = true;
626 if (current->hasTagName(trTag))
627 e = new HTMLTableCellElement(tdTag, document);
628 else if (current->hasTagName(tableTag))
629 e = new HTMLTableSectionElement(tbodyTag, document, true); // implicit
631 e = new HTMLTableRowElement(document);
638 } else if (h->hasLocalName(objectTag)) {
639 setSkipMode(objectTag);
641 } else if (h->hasLocalName(ulTag) || h->hasLocalName(olTag) ||
642 h->hasLocalName(dirTag) || h->hasLocalName(menuTag)) {
643 e = new HTMLDivElement(document);
646 } else if (h->hasLocalName(selectTag)) {
649 } else if (h->hasLocalName(pTag) || isHeaderTag(currentTagName)) {
651 popBlock(currentTagName);
654 } else if (h->hasLocalName(optionTag) || h->hasLocalName(optgroupTag)) {
655 if (localName == optgroupTag) {
656 popBlock(currentTagName);
658 } else if (localName == selectTag) {
659 // IE treats a nested select as </select>. Let's do the same
662 } else if (h->hasLocalName(colgroupTag)) {
663 if (!n->isTextNode()) {
664 popBlock(currentTagName);
667 } else if (!h->hasLocalName(bodyTag)) {
668 if (isInline(current)) {
673 } else if (current->isDocumentNode()) {
674 if (current->firstChild() == 0 || !current->firstChild()->isHTMLElement()) {
675 e = new HTMLHtmlElement(document);
681 // 3. If we couldn't handle the error, just return false and attempt to error-correct again.
684 return insertNode(n);
687 typedef bool (HTMLParser::*CreateErrorCheckFunc)(Token* t, RefPtr<Node>&);
688 typedef HashMap<AtomicStringImpl*, CreateErrorCheckFunc> FunctionMap;
690 bool HTMLParser::textCreateErrorCheck(Token* t, RefPtr<Node>& result)
692 result = new Text(document, t->text.get());
696 bool HTMLParser::commentCreateErrorCheck(Token* t, RefPtr<Node>& result)
698 result = new Comment(document, t->text.get());
702 bool HTMLParser::headCreateErrorCheck(Token* t, RefPtr<Node>& result)
704 if (!head || current->localName() == htmlTag) {
705 head = new HTMLHeadElement(document);
711 bool HTMLParser::bodyCreateErrorCheck(Token* t, RefPtr<Node>& result)
713 // body no longer allowed if we have a frameset
721 bool HTMLParser::framesetCreateErrorCheck(Token* t, RefPtr<Node>& result)
724 if (inBody && !haveFrameSet && !haveContent) {
726 // ### actually for IE document.body returns the now hidden "body" element
727 // we can't implement that behaviour now because it could cause too many
728 // regressions and the headaches are not worth the work as long as there is
729 // no site actually relying on that detail (Dirk)
731 doc()->body()->setAttribute(styleAttr, "display:none");
734 if ((haveContent || haveFrameSet) && current->localName() == htmlTag)
741 bool HTMLParser::iframeCreateErrorCheck(Token* t, RefPtr<Node>& result)
743 // a bit of a special case, since the frame is inlined
744 setSkipMode(iframeTag);
748 bool HTMLParser::formCreateErrorCheck(Token* t, RefPtr<Node>& result)
750 // Only create a new form if we're not already inside one.
751 // This is consistent with other browsers' behavior.
753 form = new HTMLFormElement(document);
759 bool HTMLParser::isindexCreateErrorCheck(Token* t, RefPtr<Node>& result)
761 Node *n = handleIsindex(t);
771 bool HTMLParser::selectCreateErrorCheck(Token* t, RefPtr<Node>& result)
777 bool HTMLParser::ddCreateErrorCheck(Token* t, RefPtr<Node>& result)
784 bool HTMLParser::dtCreateErrorCheck(Token* t, RefPtr<Node>& result)
791 bool HTMLParser::nestedCreateErrorCheck(Token* t, RefPtr<Node>& result)
793 popBlock(t->tagName);
797 bool HTMLParser::nestedStyleCreateErrorCheck(Token* t, RefPtr<Node>& result)
799 return allowNestedRedundantTag(t->tagName);
802 bool HTMLParser::tableCellCreateErrorCheck(Token* t, RefPtr<Node>& result)
809 bool HTMLParser::tableSectionCreateErrorCheck(Token* t, RefPtr<Node>& result)
817 bool HTMLParser::noembedCreateErrorCheck(Token* t, RefPtr<Node>& result)
819 setSkipMode(noembedTag);
823 bool HTMLParser::noframesCreateErrorCheck(Token* t, RefPtr<Node>& result)
825 setSkipMode(noframesTag);
829 bool HTMLParser::noscriptCreateErrorCheck(Token* t, RefPtr<Node>& result)
831 if (!m_fragment && document->frame() && document->frame()->jScriptEnabled())
832 setSkipMode(noscriptTag);
836 bool HTMLParser::mapCreateErrorCheck(Token* t, RefPtr<Node>& result)
838 map = new HTMLMapElement(document);
843 bool HTMLParser::canvasCreateErrorCheck(Token* t, RefPtr<Node>& result)
845 if (!m_fragment && document->frame() && document->frame()->jScriptEnabled())
846 setSkipMode(canvasTag);
850 PassRefPtr<Node> HTMLParser::getNode(Token* t)
852 // Init our error handling table.
853 static FunctionMap gFunctionMap;
854 if (gFunctionMap.isEmpty()) {
855 gFunctionMap.set(aTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
856 gFunctionMap.set(bTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
857 gFunctionMap.set(bigTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
858 gFunctionMap.set(bodyTag.localName().impl(), &HTMLParser::bodyCreateErrorCheck);
859 gFunctionMap.set(buttonTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
860 gFunctionMap.set(canvasTag.localName().impl(), &HTMLParser::canvasCreateErrorCheck);
861 gFunctionMap.set(commentAtom.impl(), &HTMLParser::commentCreateErrorCheck);
862 gFunctionMap.set(ddTag.localName().impl(), &HTMLParser::ddCreateErrorCheck);
863 gFunctionMap.set(dtTag.localName().impl(), &HTMLParser::dtCreateErrorCheck);
864 gFunctionMap.set(formTag.localName().impl(), &HTMLParser::formCreateErrorCheck);
865 gFunctionMap.set(framesetTag.localName().impl(), &HTMLParser::framesetCreateErrorCheck);
866 gFunctionMap.set(headTag.localName().impl(), &HTMLParser::headCreateErrorCheck);
867 gFunctionMap.set(iTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
868 gFunctionMap.set(iframeTag.localName().impl(), &HTMLParser::iframeCreateErrorCheck);
869 gFunctionMap.set(isindexTag.localName().impl(), &HTMLParser::isindexCreateErrorCheck);
870 gFunctionMap.set(liTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
871 gFunctionMap.set(mapTag.localName().impl(), &HTMLParser::mapCreateErrorCheck);
872 gFunctionMap.set(nobrTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
873 gFunctionMap.set(noembedTag.localName().impl(), &HTMLParser::noembedCreateErrorCheck);
874 gFunctionMap.set(noframesTag.localName().impl(), &HTMLParser::noframesCreateErrorCheck);
875 gFunctionMap.set(noscriptTag.localName().impl(), &HTMLParser::noscriptCreateErrorCheck);
876 gFunctionMap.set(sTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
877 gFunctionMap.set(selectTag.localName().impl(), &HTMLParser::selectCreateErrorCheck);
878 gFunctionMap.set(smallTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
879 gFunctionMap.set(strikeTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
880 gFunctionMap.set(tbodyTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck);
881 gFunctionMap.set(tdTag.localName().impl(), &HTMLParser::tableCellCreateErrorCheck);
882 gFunctionMap.set(textAtom.impl(), &HTMLParser::textCreateErrorCheck);
883 gFunctionMap.set(tfootTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck);
884 gFunctionMap.set(thTag.localName().impl(), &HTMLParser::tableCellCreateErrorCheck);
885 gFunctionMap.set(theadTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck);
886 gFunctionMap.set(trTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
887 gFunctionMap.set(ttTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
888 gFunctionMap.set(uTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
889 gFunctionMap.set(wbrTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
894 if (CreateErrorCheckFunc errorCheckFunc = gFunctionMap.get(t->tagName.impl()))
895 proceed = (this->*errorCheckFunc)(t, result);
897 result = HTMLElementFactory::createHTMLElement(t->tagName, doc(), form);
898 return result.release();
901 #define MAX_REDUNDANT 20
903 bool HTMLParser::allowNestedRedundantTag(const AtomicString& _tagName)
905 // www.liceo.edu.mx is an example of a site that achieves a level of nesting of
906 // about 1500 tags, all from a bunch of <b>s. We will only allow at most 20
907 // nested tags of the same type before just ignoring them altogether.
909 for (HTMLStackElem* curr = blockStack;
910 i < MAX_REDUNDANT && curr && curr->tagName == _tagName;
911 curr = curr->next, i++);
912 return i != MAX_REDUNDANT;
915 void HTMLParser::processCloseTag(Token *t)
917 // Support for really broken html.
918 // we never close the body tag, since some stupid web pages close it before the actual end of the doc.
919 // let's rely on the end() call to close things.
920 if (t->tagName == htmlTag || t->tagName == bodyTag)
923 if (t->tagName == formTag)
925 else if (t->tagName == mapTag)
927 else if (t->tagName == selectTag)
930 HTMLStackElem* oldElem = blockStack;
931 popBlock(t->tagName);
932 if (oldElem == blockStack && t->tagName == pTag) {
933 // We encountered a stray </p>. Amazingly Gecko, WinIE, and MacIE all treat
934 // this as a valid break, i.e., <p></p>. So go ahead and make the empty
938 popBlock(t->tagName);
942 bool HTMLParser::isHeaderTag(const AtomicString& tagName)
944 static HashSet<AtomicStringImpl*> headerTags;
945 if (headerTags.isEmpty()) {
946 headerTags.add(h1Tag.localName().impl());
947 headerTags.add(h2Tag.localName().impl());
948 headerTags.add(h3Tag.localName().impl());
949 headerTags.add(h4Tag.localName().impl());
950 headerTags.add(h5Tag.localName().impl());
951 headerTags.add(h6Tag.localName().impl());
954 return headerTags.contains(tagName.impl());
957 void HTMLParser::popNestedHeaderTag()
959 // This function only cares about checking for nested headers that have only inlines in between them.
960 Node* currNode = current;
961 for (HTMLStackElem* curr = blockStack; curr; curr = curr->next) {
962 if (isHeaderTag(curr->tagName)) {
963 popBlock(curr->tagName);
966 if (currNode && !isInline(currNode))
968 currNode = curr->node.get();
972 bool HTMLParser::isInline(Node* node) const
974 if (node->isTextNode())
977 if (node->isHTMLElement()) {
978 HTMLElement* e = static_cast<HTMLElement*>(node);
979 if (e->hasLocalName(aTag) || e->hasLocalName(fontTag) || e->hasLocalName(ttTag) ||
980 e->hasLocalName(uTag) || e->hasLocalName(bTag) || e->hasLocalName(iTag) ||
981 e->hasLocalName(sTag) || e->hasLocalName(strikeTag) || e->hasLocalName(bigTag) ||
982 e->hasLocalName(smallTag) || e->hasLocalName(emTag) || e->hasLocalName(strongTag) ||
983 e->hasLocalName(dfnTag) || e->hasLocalName(codeTag) || e->hasLocalName(sampTag) ||
984 e->hasLocalName(kbdTag) || e->hasLocalName(varTag) || e->hasLocalName(citeTag) ||
985 e->hasLocalName(abbrTag) || e->hasLocalName(acronymTag) || e->hasLocalName(subTag) ||
986 e->hasLocalName(supTag) || e->hasLocalName(spanTag) || e->hasLocalName(nobrTag) ||
987 e->hasLocalName(wbrTag) || e->hasLocalName(noframesTag) || e->hasLocalName(nolayerTag) ||
988 e->hasLocalName(noembedTag) || (e->hasLocalName(noscriptTag) && !m_fragment && document->frame() && document->frame()->jScriptEnabled()))
995 bool HTMLParser::isResidualStyleTag(const AtomicString& tagName)
997 static HashSet<AtomicStringImpl*> residualStyleTags;
998 if (residualStyleTags.isEmpty()) {
999 residualStyleTags.add(aTag.localName().impl());
1000 residualStyleTags.add(fontTag.localName().impl());
1001 residualStyleTags.add(ttTag.localName().impl());
1002 residualStyleTags.add(uTag.localName().impl());
1003 residualStyleTags.add(bTag.localName().impl());
1004 residualStyleTags.add(iTag.localName().impl());
1005 residualStyleTags.add(sTag.localName().impl());
1006 residualStyleTags.add(strikeTag.localName().impl());
1007 residualStyleTags.add(bigTag.localName().impl());
1008 residualStyleTags.add(smallTag.localName().impl());
1009 residualStyleTags.add(emTag.localName().impl());
1010 residualStyleTags.add(strongTag.localName().impl());
1011 residualStyleTags.add(dfnTag.localName().impl());
1012 residualStyleTags.add(codeTag.localName().impl());
1013 residualStyleTags.add(sampTag.localName().impl());
1014 residualStyleTags.add(kbdTag.localName().impl());
1015 residualStyleTags.add(varTag.localName().impl());
1016 residualStyleTags.add(nobrTag.localName().impl());
1017 residualStyleTags.add(wbrTag.localName().impl());
1020 return residualStyleTags.contains(tagName.impl());
1023 bool HTMLParser::isAffectedByResidualStyle(const AtomicString& tagName)
1025 if (isResidualStyleTag(tagName))
1028 static HashSet<AtomicStringImpl*> affectedBlockTags;
1029 if (affectedBlockTags.isEmpty()) {
1030 affectedBlockTags.add(addressTag.localName().impl());
1031 affectedBlockTags.add(blockquoteTag.localName().impl());
1032 affectedBlockTags.add(centerTag.localName().impl());
1033 affectedBlockTags.add(ddTag.localName().impl());
1034 affectedBlockTags.add(divTag.localName().impl());
1035 affectedBlockTags.add(dlTag.localName().impl());
1036 affectedBlockTags.add(dtTag.localName().impl());
1037 affectedBlockTags.add(formTag.localName().impl());
1038 affectedBlockTags.add(h1Tag.localName().impl());
1039 affectedBlockTags.add(h2Tag.localName().impl());
1040 affectedBlockTags.add(h3Tag.localName().impl());
1041 affectedBlockTags.add(h4Tag.localName().impl());
1042 affectedBlockTags.add(h5Tag.localName().impl());
1043 affectedBlockTags.add(h6Tag.localName().impl());
1044 affectedBlockTags.add(liTag.localName().impl());
1045 affectedBlockTags.add(listingTag.localName().impl());
1046 affectedBlockTags.add(olTag.localName().impl());
1047 affectedBlockTags.add(pTag.localName().impl());
1048 affectedBlockTags.add(preTag.localName().impl());
1049 affectedBlockTags.add(ulTag.localName().impl());
1052 return affectedBlockTags.contains(tagName.impl());
1055 void HTMLParser::handleResidualStyleCloseTagAcrossBlocks(HTMLStackElem* elem)
1057 // Find the element that crosses over to a higher level. For now, if there is more than
1058 // one, we will just give up and not attempt any sort of correction. It's highly unlikely that
1059 // there will be more than one, since <p> tags aren't allowed to be nested.
1060 ExceptionCode ec = 0;
1061 HTMLStackElem* curr = blockStack;
1062 HTMLStackElem* maxElem = 0;
1063 HTMLStackElem* prev = 0;
1064 HTMLStackElem* prevMaxElem = 0;
1065 while (curr && curr != elem) {
1066 if (curr->level > elem->level) {
1077 if (!curr || !maxElem || !isAffectedByResidualStyle(maxElem->tagName)) return;
1079 Node* residualElem = prev->node.get();
1080 Node* blockElem = prevMaxElem ? prevMaxElem->node.get() : current;
1081 Node* parentElem = elem->node.get();
1083 // Check to see if the reparenting that is going to occur is allowed according to the DOM.
1084 // FIXME: We should either always allow it or perform an additional fixup instead of
1085 // just bailing here.
1086 // Example: <p><font><center>blah</font></center></p> isn't doing a fixup right now.
1087 if (!parentElem->childAllowed(blockElem))
1090 if (maxElem->node->parentNode() != elem->node) {
1091 // Walk the stack and remove any elements that aren't residual style tags. These
1092 // are basically just being closed up. Example:
1093 // <font><span>Moo<p>Goo</font></p>.
1094 // In the above example, the <span> doesn't need to be reopened. It can just close.
1095 HTMLStackElem* currElem = maxElem->next;
1096 HTMLStackElem* prevElem = maxElem;
1097 while (currElem != elem) {
1098 HTMLStackElem* nextElem = currElem->next;
1099 if (!isResidualStyleTag(currElem->tagName)) {
1100 prevElem->next = nextElem;
1101 prevElem->node = currElem->node;
1105 prevElem = currElem;
1106 currElem = nextElem;
1109 // We have to reopen residual tags in between maxElem and elem. An example of this case is:
1110 // <font><i>Moo<p>Foo</font>.
1111 // In this case, we need to transform the part before the <p> into:
1112 // <font><i>Moo</i></font><i>
1113 // so that the <i> will remain open. This involves the modification of elements
1114 // in the block stack.
1115 // This will also affect how we ultimately reparent the block, since we want it to end up
1116 // under the reopened residual tags (e.g., the <i> in the above example.)
1117 RefPtr<Node> prevNode = 0;
1119 while (currElem->node != residualElem) {
1120 if (isResidualStyleTag(currElem->node->localName())) {
1121 // Create a clone of this element.
1122 RefPtr<Node> currNode = currElem->node->cloneNode(false);
1124 // Change the stack element's node to point to the clone.
1125 currElem->node = currNode;
1127 // Attach the previous node as a child of this new node.
1129 currNode->appendChild(prevNode, ec);
1130 else // The new parent for the block element is going to be the innermost clone.
1131 parentElem = currNode.get();
1133 prevNode = currNode.get();
1136 currElem = currElem->next;
1139 // Now append the chain of new residual style elements if one exists.
1141 elem->node->appendChild(prevNode, ec);
1144 // Check if the block is still in the tree. If it isn't, then we don't
1145 // want to remove it from its parent (that would crash) or insert it into
1146 // a new parent later. See http://bugzilla.opendarwin.org/show_bug.cgi?id=6778
1147 bool isBlockStillInTree = blockElem->parentNode();
1149 // We need to make a clone of |residualElem| and place it just inside |blockElem|.
1150 // All content of |blockElem| is reparented to be under this clone. We then
1151 // reparent |blockElem| using real DOM calls so that attachment/detachment will
1152 // be performed to fix up the rendering tree.
1153 // So for this example: <b>...<p>Foo</b>Goo</p>
1154 // The end result will be: <b>...</b><p><b>Foo</b>Goo</p>
1156 // Step 1: Remove |blockElem| from its parent, doing a batch detach of all the kids.
1158 form->setPreserveAcrossRemove(true);
1159 if (isBlockStillInTree)
1160 blockElem->parentNode()->removeChild(blockElem, ec);
1162 // Step 2: Clone |residualElem|.
1163 RefPtr<Node> newNode = residualElem->cloneNode(false); // Shallow clone. We don't pick up the same kids.
1165 // Step 3: Place |blockElem|'s children under |newNode|. Remove all of the children of |blockElem|
1166 // before we've put |newElem| into the document. That way we'll only do one attachment of all
1167 // the new content (instead of a bunch of individual attachments).
1168 Node* currNode = blockElem->firstChild();
1170 Node* nextNode = currNode->nextSibling();
1171 newNode->appendChild(currNode, ec);
1172 currNode = nextNode;
1175 // Step 4: Place |newNode| under |blockElem|. |blockElem| is still out of the document, so no
1176 // attachment can occur yet.
1177 blockElem->appendChild(newNode.release(), ec);
1179 // Step 5: Reparent |blockElem|. Now the full attachment of the fixed up tree takes place.
1180 if (isBlockStillInTree)
1181 parentElem->appendChild(blockElem, ec);
1183 // Step 6: Elide |elem|, since it is effectively no longer open. Also update
1184 // the node associated with the previous stack element so that when it gets popped,
1185 // it doesn't make the residual element the next current node.
1186 HTMLStackElem* currElem = maxElem;
1187 HTMLStackElem* prevElem = 0;
1188 while (currElem != elem) {
1189 prevElem = currElem;
1190 currElem = currElem->next;
1192 prevElem->next = elem->next;
1193 prevElem->node = elem->node;
1196 // Step 7: Reopen intermediate inlines, e.g., <b><p><i>Foo</b>Goo</p>.
1197 // In the above example, Goo should stay italic.
1199 HTMLStackElem* residualStyleStack = 0;
1200 while (curr && curr != maxElem) {
1201 // We will actually schedule this tag for reopening
1202 // after we complete the close of this entire block.
1203 Node* currNode = current;
1204 if (isResidualStyleTag(curr->tagName)) {
1205 // We've overloaded the use of stack elements and are just reusing the
1206 // struct with a slightly different meaning to the variables. Instead of chaining
1207 // from innermost to outermost, we build up a list of all the tags we need to reopen
1208 // from the outermost to the innermost, i.e., residualStyleStack will end up pointing
1209 // to the outermost tag we need to reopen.
1210 // We also set curr->node to be the actual element that corresponds to the ID stored in
1211 // curr->id rather than the node that you should pop to when the element gets pulled off
1214 curr->node = currNode;
1215 curr->next = residualStyleStack;
1216 residualStyleStack = curr;
1224 reopenResidualStyleTags(residualStyleStack, 0); // FIXME: Deal with stray table content some day
1225 // if it becomes necessary to do so.
1228 form->setPreserveAcrossRemove(false);
// Reopens residual style tags taken from the chain of stack elements that starts
// at |elem| and is linked through elem->next. For an element, a shallow clone of
// elem->node is placed in the tree, a block is pushed for it with the element's
// tag name and level, and |current| is moved to the clone.
//
// elem: stack element describing the tag to reopen; elem->node is the node that is cloned.
// malformedTableParent: when non-null, the clone is inserted into this node before
//     its last child, and the pushed block is flagged as stray table content.
//     The variable is cleared after it has been used.
1231 void HTMLParser::reopenResidualStyleTags(HTMLStackElem* elem, Node* malformedTableParent)
1233 // Loop for each tag that needs to be reopened.
1235 // Create a shallow clone of the DOM node for this element.
1236 RefPtr<Node> newNode = elem->node->cloneNode(false);
1238 // Append the new node. In the malformed table case, we need to insert before the table,
1239 // which will be the last child.
1240 ExceptionCode ec = 0;
// NOTE(review): presumably the appendChild on |current| is the alternative to the
// insertBefore on |malformedTableParent|, not an addition to it -- confirm.
1241 if (malformedTableParent)
1242 malformedTableParent->insertBefore(newNode, malformedTableParent->lastChild(), ec);
1244 current->appendChild(newNode, ec);
1245 // FIXME: Is it really OK to ignore the exceptions here?
1247 // Now push a new stack element for this node we just created.
// The pushed block reuses the tag name and level recorded on |elem|.
1248 pushBlock(elem->tagName, elem->level);
1250 // Set our strayTableContent boolean if needed, so that the reopened tag also knows
1251 // that it is inside a malformed table.
// blockStack is expected to be the element that pushBlock() just created.
1252 blockStack->strayTableContent = malformedTableParent != 0;
1253 if (blockStack->strayTableContent)
1254 inStrayTableContent++;
1256 // Clear our malformed table parent variable.
1257 malformedTableParent = 0;
1259 // Update |current| manually to point to the new node.
1260 setCurrent(newNode.get());
1262 // Advance to the next tag that needs to be reopened.
1263 HTMLStackElem* next = elem->next;
// Allocates a new block stack element for |tagName| at nesting level |_level|.
// The element is constructed with the current node and the current top of the
// block stack.
// NOTE(review): presumably the new element then becomes |blockStack|;
// reopenResidualStyleTags() reads |blockStack| right after calling this -- confirm.
1269 void HTMLParser::pushBlock(const AtomicString& tagName, int _level)
1271 HTMLStackElem *Elem = new HTMLStackElem(tagName, _level, current, blockStack);
// Handles the close of the block named |_tagName|.
//
// The block stack is scanned from the top for an entry whose tag name matches,
// while the highest level seen on the way is tracked in maxLevel. When maxLevel
// exceeds the level of the matched entry, a residual style tag is handed off to
// handleResidualStyleCloseTagAcrossBlocks(). Entries that are residual style tags
// (and for which isAffectedByStyle is set) are chained onto residualStyleStack,
// and that chain is passed to reopenResidualStyleTags() at the end together with
// the malformed table parent, if one was found.
1275 void HTMLParser::popBlock(const AtomicString& _tagName)
1277 HTMLStackElem *Elem = blockStack;
// Search for the entry to close, remembering the largest level passed over.
1281 while (Elem && (Elem->tagName != _tagName)) {
1282 if (maxLevel < Elem->level)
1283 maxLevel = Elem->level;
// NOTE(review): Elem is dereferenced below; presumably the no-match case
// (Elem == 0) has been handled before this point -- confirm.
1290 if (maxLevel > Elem->level) {
1291 // We didn't match because the tag is in a different scope, e.g.,
1292 // <b><p>Foo</b>. Try to correct the problem.
1293 if (!isResidualStyleTag(_tagName))
1295 return handleResidualStyleCloseTagAcrossBlocks(Elem);
// Whether the tag being closed is subject to residual style handling.
1298 bool isAffectedByStyle = isAffectedByResidualStyle(Elem->tagName);
// Chain of residual style entries that will be reopened afterwards.
1299 HTMLStackElem* residualStyleStack = 0;
// Set only when malformed table content is detected below.
1300 Node* malformedTableParent = 0;
1304 if (Elem->tagName == _tagName) {
// Snapshot of the stray-table counter, compared against its later value.
1305 int strayTable = inStrayTableContent;
1309 // This element was the root of some malformed content just inside an implicit or
1310 // explicit <tbody> or <tr>.
1311 // If we end up needing to reopen residual style tags, the root of the reopened chain
1312 // must also know that it is the root of malformed content inside a <tbody>/<tr>.
1313 if (strayTable && (inStrayTableContent < strayTable) && residualStyleStack) {
// Walk up from |current| to the nearest <table>; the malformed table parent
// is that table's parent (or 0 if no table is found).
1314 Node* curr = current;
1315 while (curr && !curr->hasTagName(tableTag))
1316 curr = curr->parentNode();
1317 malformedTableParent = curr ? curr->parentNode() : 0;
1321 if (form && Elem->tagName == formTag)
1322 // A <form> is being closed prematurely (and this is
1323 // malformed HTML). Set an attribute on the form to clear out its
1325 form->setMalformed(true);
1327 // Schedule this tag for reopening
1328 // after we complete the close of this entire block.
1329 Node* currNode = current;
1330 if (isAffectedByStyle && isResidualStyleTag(Elem->tagName)) {
1331 // We've overloaded the use of stack elements and are just reusing the
1332 // struct with a slightly different meaning to the variables. Instead of chaining
1333 // from innermost to outermost, we build up a list of all the tags we need to reopen
1334 // from the outermost to the innermost, i.e., residualStyleStack will end up pointing
1335 // to the outermost tag we need to reopen.
1336 // We also set Elem->node to be the actual element that corresponds to the tag name stored in
1337 // Elem->tagName rather than the node that you should pop to when the element gets pulled off
// the stack.
1340 Elem->next = residualStyleStack;
1341 Elem->node = currNode;
1342 residualStyleStack = Elem;
// Reopen whatever was scheduled above; a null chain means nothing to reopen.
1350 reopenResidualStyleTags(residualStyleStack, malformedTableParent);
// Pops the top element off the block stack: the stack head advances to the next
// element, the popped element's saved node becomes |current|, and the
// stray-table-content counter is decremented if the popped element was flagged.
// NOTE(review): |delBlock| presumably controls whether the popped stack element
// is deleted -- confirm.
1353 void HTMLParser::popOneBlock(bool delBlock)
1355 HTMLStackElem* elem = blockStack;
1357 // Form elements restore their state during the parsing process.
1358 // Also, a few elements (<applet>, <object>) need to know when all child elements (<param>s) are available.
// closeRenderer() is skipped when the node being restored is already |current|.
1359 if (current && elem->node != current)
1360 current->closeRenderer();
1362 blockStack = elem->next;
1363 setCurrent(elem->node.get());
1365 if (elem->strayTableContent)
1366 inStrayTableContent--;
// Loops for as long as the block stack is non-empty and |current| is an inline node.
// NOTE(review): presumably each iteration pops one block so that |current| ends up
// on a non-inline node -- confirm.
1372 void HTMLParser::popInlineBlocks()
1374 while (blockStack && isInline(current))
// NOTE(review): the name suggests this releases the remaining block stack
// entries -- confirm against the function body.
1378 void HTMLParser::freeBlock()
// Creates the HTMLHeadElement for this document and inserts it as a child of the
// document's first child, placed before the body element returned by doc()->body().
1384 void HTMLParser::createHead()
// Guard: a head already exists, or the document has no first child to insert under.
1386 if (head || !doc()->firstChild())
1389 head = new HTMLHeadElement(document);
1390 HTMLElement* body = doc()->body();
1391 ExceptionCode ec = 0;
// NOTE(review): |ec| is filled in by insertBefore; confirm how a failure is handled.
1392 doc()->firstChild()->insertBefore(head, body, ec);
// Builds the replacement subtree for an <isindex> token: a <div> containing
// an <hr>, a text node with the prompt, the HTMLIsIndexElement itself, and a
// closing <hr>.
//
// t: the token being handled; its attribute map is given to the new
//    HTMLIsIndexElement.
// NOTE(review): presumably returns the <div> node |n| -- confirm.
1397 Node* HTMLParser::handleIsindex(Token* t)
1399 Node* n = new HTMLDivElement(document);
1401 NamedMappedAttrMap* attrs = t->attrs.get();
// The element is created against the parser's current |form|.
1403 RefPtr<HTMLIsIndexElement> isIndex = new HTMLIsIndexElement(document, form);
1404 isIndex->setAttributeMap(attrs);
1405 isIndex->setAttribute(typeAttr, "khtml_isindex");
// Default prompt text; replaced below when the token carries a prompt attribute.
1407 String text = searchableIndexIntroduction();
// NOTE(review): |attrs| is dereferenced here; confirm it cannot be null at this point.
1409 if (Attribute *a = attrs->getAttributeItem(promptAttr))
1410 text = a->value().domString() + " ";
// Assemble the children in display order: rule, prompt text, input, rule.
1414 n->addChild(new HTMLHRElement(document));
1415 n->addChild(new Text(document, text));
1416 n->addChild(isIndex.get());
1417 n->addChild(new HTMLHRElement(document));
// Called when the body starts. Inserts the |isindex| node into the tree, passing
// true as the second argument so that insertNode() does not descend into it.
// NOTE(review): presumably this only happens when an <isindex> node is pending -- confirm.
1422 void HTMLParser::startBody()
1429 insertNode(isindex.get(), true /* don't descend into this node */);
// Called when parsing is complete. Creates the <html> element for a completely
// empty document and then notifies the document that parsing has finished.
1434 void HTMLParser::finished()
1436 // In the case of a completely empty document, here's the place to create the HTML element.
// "Empty" here means |current| is still the document node and it has no children.
1437 if (current && current->isDocumentNode() && !current->firstChild())
1438 insertNode(new HTMLHtmlElement(document));
1440 // This ensures that "current" is not left pointing to a node when the document is destroyed.
1444 // Warning, this may delete the tokenizer and parser, so don't try to do anything else after this.
1446 document->finishedParsing();