2 This file is part of the KDE libraries
4 Copyright (C) 1997 Martin Jones (mjones@kde.org)
5 (C) 1997 Torben Weis (weis@kde.org)
6 (C) 1999,2001 Lars Knoll (knoll@kde.org)
7 (C) 2000,2001 Dirk Mueller (mueller@kde.org)
8 Copyright (C) 2004, 2005, 2006 Apple Computer, Inc.
10 This library is free software; you can redistribute it and/or
11 modify it under the terms of the GNU Library General Public
12 License as published by the Free Software Foundation; either
13 version 2 of the License, or (at your option) any later version.
15 This library is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Library General Public License for more details.
20 You should have received a copy of the GNU Library General Public License
21 along with this library; see the file COPYING.LIB. If not, write to
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 Boston, MA 02111-1307, USA.
25 //----------------------------------------------------------------------------
27 // KDE HTML Widget -- HTML Parser
30 #include "htmlparser.h"
32 #include "DocumentFragmentImpl.h"
34 #include "FrameView.h"
35 #include "HTMLFormElementImpl.h"
36 #include "HTMLIsIndexElementImpl.h"
37 #include "LocalizedStrings.h"
38 #include "cssproperties.h"
39 #include "cssvalues.h"
41 #include "CommentImpl.h"
42 #include "html_baseimpl.h"
43 #include "html_blockimpl.h"
44 #include "html_canvasimpl.h"
45 #include "html_documentimpl.h"
46 #include "html_headimpl.h"
47 #include "html_imageimpl.h"
48 #include "html_inlineimpl.h"
49 #include "html_listimpl.h"
50 #include "html_objectimpl.h"
51 #include "html_tableimpl.h"
52 #include "htmlfactory.h"
53 #include "htmltokenizer.h"
54 #include "render_object.h"
55 #include <kxmlcore/HashMap.h>
56 #include <kxmlcore/HashSet.h>
60 using namespace HTMLNames;
68 HTMLStackElem(const AtomicString& _tagName,
76 strayTableContent(false),
83 bool strayTableContent;
84 RefPtr<NodeImpl> node;
91 * The parser parses tokenized input into the document, building up the
92 * document tree. If the document is well-formed, parsing it is straightforward.
94 * Unfortunately, people can't write well-formed HTML documents, so the parser
95 * has to be tolerant about errors.
97 * We have to take care of the following error conditions:
98 * 1. The element being added is explicitly forbidden inside some outer tag.
99 * In this case we should close all tags up to the one that forbids
100 * the element, and add it afterwards.
101 * 2. We are not allowed to add the element directly. It could be that
102 * the person writing the document forgot some tag in between (or that the
103 * tag in between is optional...). This could be the case with the following
104 * tags: HTML HEAD BODY TBODY TR TD LI (did I forget any?)
105 * 3. We want to add a block element inside an inline element. Close all
106 * inline elements up to the next higher block element.
107 * 4. If this doesn't help, close elements until we are allowed to add the
108 * element, or ignore the tag.
111 HTMLParser::HTMLParser(DocumentImpl* doc)
114 , currentIsReferenced(false)
121 HTMLParser::HTMLParser(DocumentFragmentImpl* frag)
122 : document(frag->getDocument())
124 , currentIsReferenced(false)
133 HTMLParser::~HTMLParser()
140 void HTMLParser::reset()
147 haveFrameSet = false;
150 inStrayTableContent = 0;
158 discard_until = nullAtom;
// Makes newCurrent the parser's current node and records, in
// currentIsReferenced, whether it counts as "referenced": it is non-null and
// is not the node returned by doc().
// NOTE(review): the statements guarded by the two ifs below are not visible in
// this listing -- presumably a ref of the new node and a deref of the old one;
// confirm against the full file.
161 void HTMLParser::setCurrent(NodeImpl *newCurrent)
163 bool newCurrentIsReferenced = newCurrent && newCurrent != doc();
164 if (newCurrentIsReferenced)
166 if (currentIsReferenced)
// Both members are updated together so the flag always describes |current|.
168 current = newCurrent;
169 currentIsReferenced = newCurrentIsReferenced;
172 PassRefPtr<NodeImpl> HTMLParser::parseToken(Token *t)
174 if (!discard_until.isNull()) {
175 if (t->tagName == discard_until && !t->beginTag)
176 discard_until = nullAtom;
178 // do not skip </iframe>
179 if (!discard_until.isNull() || (current->localName() != t->tagName))
183 // Apparently some sites use </br> instead of <br>. Be compatible with IE and Firefox and treat this like <br>.
184 if (t->isCloseTag(brTag) && doc()->inCompatMode())
192 // ignore spaces, if we're not inside a paragraph or other inline code
193 if (t->tagName == textAtom && t->text) {
194 if (inBody && !skipMode() && current->localName() != styleTag && current->localName() != titleTag &&
195 current->localName() != scriptTag && !t->text->containsOnlyWhitespace())
199 RefPtr<NodeImpl> n = getNode(t);
200 // just to be sure, and to catch currently unimplemented stuff
205 if (n->isHTMLElement()) {
206 HTMLElementImpl* e = static_cast<HTMLElementImpl*>(n.get());
207 e->setAttributeMap(t->attrs.get());
209 // take care of optional close tags
210 if (e->endTagRequirement() == TagStatusOptional)
211 popBlock(t->tagName);
213 if (isHeaderTag(t->tagName))
214 // Do not allow two header tags to be nested if the intervening tags are inlines.
215 popNestedHeaderTag();
218 if (!insertNode(n.get(), t->flat)) {
219 // we couldn't insert the node
221 if (n->isElementNode()) {
222 ElementImpl* e = static_cast<ElementImpl*>(n.get());
223 e->setAttributeMap(0);
// True when n is a <tbody>, <tfoot> or <thead> element.
240 static bool isTableSection(NodeImpl* n)
242 return n->hasTagName(tbodyTag) || n->hasTagName(tfootTag) || n->hasTagName(theadTag);
// True when n is a <tr>, <td> or <th> element, or matches whatever further
// alternatives follow the trailing || below.
// NOTE(review): the continuation of this expression is not visible in this
// listing; check the full file for the remaining alternatives.
245 static bool isTablePart(NodeImpl* n)
247 return n->hasTagName(trTag) || n->hasTagName(tdTag) || n->hasTagName(thTag) ||
// True when n is the <table> element itself or anything isTablePart() accepts.
251 static bool isTableRelated(NodeImpl* n)
253 return n->hasTagName(tableTag) || isTablePart(n);
256 bool HTMLParser::insertNode(NodeImpl *n, bool flat)
258 RefPtr<NodeImpl> protectNode(n);
260 const AtomicString& localName = n->localName();
261 int tagPriority = n->isHTMLElement() ? static_cast<HTMLElementImpl*>(n)->tagPriority() : 0;
263 // let's be stupid and just try to insert it.
264 // this should work if the document is well-formed
265 NodeImpl *newNode = current->addChild(n);
267 // don't push elements without end tags (e.g., <img>) on the stack
268 bool parentAttached = current->attached();
269 if (tagPriority > 0 && !flat) {
270 pushBlock(localName, tagPriority);
271 if (newNode == current)
275 if (parentAttached && !n->attached() && !m_fragment)
278 if (parentAttached && !n->attached() && !m_fragment)
280 if (n->maintainsState()) {
281 doc()->registerMaintainsState(n);
282 QStringList &states = doc()->restoreState();
283 if (!states.isEmpty())
284 n->restoreState(states);
291 return handleError(n, flat, localName, tagPriority); // Try to handle the error.
294 bool HTMLParser::handleError(NodeImpl* n, bool flat, const AtomicString& localName, int tagPriority)
296 // Error handling code. This is just ad hoc handling of specific parent/child combinations.
298 bool handled = false;
300 // 1. Check out the element's tag name to decide how to deal with errors.
301 if (n->isTextNode()) {
302 if (current->hasTagName(selectTag))
304 } else if (n->isHTMLElement()) {
305 HTMLElementImpl* h = static_cast<HTMLElementImpl*>(n);
306 if (h->hasLocalName(trTag) || h->hasLocalName(thTag) || h->hasLocalName(tdTag)) {
307 if (inStrayTableContent && !isTableRelated(current)) {
308 // pop out to the nearest enclosing table-related tag.
309 while (blockStack && !isTableRelated(current))
311 return insertNode(n);
313 } else if (h->hasLocalName(headTag)) {
314 if (!current->isDocumentNode() && !current->hasTagName(htmlTag))
316 } else if (h->hasLocalName(metaTag) || h->hasLocalName(linkTag) || h->hasLocalName(baseTag)) {
320 if (head->addChild(n)) {
321 if (!n->attached() && !m_fragment)
327 } else if (h->hasLocalName(htmlTag)) {
328 if (!current->isDocumentNode() ) {
329 if (doc()->firstChild()->hasTagName(htmlTag)) {
330 // we have another <HTML> element.... apply attributes to existing one
331 // make sure we don't overwrite already existing attributes
332 NamedAttrMapImpl *map = static_cast<ElementImpl*>(n)->attributes(true);
333 ElementImpl *existingHTML = static_cast<ElementImpl*>(doc()->firstChild());
334 NamedAttrMapImpl *bmap = existingHTML->attributes(false);
335 for (unsigned l = 0; map && l < map->length(); ++l) {
336 AttributeImpl* it = map->attributeItem(l);
337 if (!bmap->getAttributeItem(it->name()))
338 existingHTML->setAttribute(it->name(), it->value());
343 } else if (h->hasLocalName(titleTag) || h->hasLocalName(styleTag)) {
347 NodeImpl *newNode = head->addChild(n);
349 pushBlock(localName, tagPriority);
351 if (!n->attached() && !m_fragment)
354 setSkipMode(styleTag);
359 setSkipMode(styleTag);
362 } else if (h->hasLocalName(bodyTag)) {
363 if (inBody && doc()->body()) {
364 // we have another <BODY> element.... apply attributes to existing one
365 // make sure we don't overwrite already existing attributes
366 // some sites use <body bgcolor=rightcolor>...<body bgcolor=wrongcolor>
367 NamedAttrMapImpl *map = static_cast<ElementImpl*>(n)->attributes(true);
368 ElementImpl *existingBody = doc()->body();
369 NamedAttrMapImpl *bmap = existingBody->attributes(false);
370 for (unsigned l = 0; map && l < map->length(); ++l) {
371 AttributeImpl* it = map->attributeItem(l);
372 if (!bmap->getAttributeItem(it->name()))
373 existingBody->setAttribute(it->name(), it->value());
377 else if (!current->isDocumentNode())
379 } else if (h->hasLocalName(inputTag)) {
380 if (equalIgnoringCase(h->getAttribute(typeAttr), "hidden") && form) {
382 if (!n->attached() && !m_fragment)
386 } else if (h->hasLocalName(ddTag) || h->hasLocalName(dtTag)) {
387 e = new HTMLDListElementImpl(document);
392 } else if (h->hasLocalName(areaTag)) {
395 if (!n->attached() && !m_fragment)
401 } else if (h->hasLocalName(captionTag)) {
402 if (isTablePart(current)) {
403 NodeImpl* tsection = current;
404 if (current->hasTagName(trTag))
405 tsection = current->parent();
406 else if (current->hasTagName(tdTag) || current->hasTagName(thTag))
407 tsection = current->parent()->parent();
408 NodeImpl* table = tsection->parent();
409 ExceptionCode ec = 0;
410 table->insertBefore(n, tsection, ec);
411 pushBlock(localName, tagPriority);
413 inStrayTableContent++;
414 blockStack->strayTableContent = true;
417 } else if (h->hasLocalName(theadTag) || h->hasLocalName(tbodyTag) ||
418 h->hasLocalName(tfootTag) || h->hasLocalName(colgroupTag)) {
419 if (isTableRelated(current)) {
420 while (blockStack && isTablePart(current))
422 return insertNode(n);
427 // 2. Next we examine our currently active element to do some further error handling.
428 if (current->isHTMLElement()) {
429 HTMLElementImpl* h = static_cast<HTMLElementImpl*>(current);
430 const AtomicString& currentTagName = current->localName();
431 if (h->hasLocalName(htmlTag)) {
432 HTMLElementImpl* elt = n->isHTMLElement() ? static_cast<HTMLElementImpl*>(n) : 0;
433 if (elt && (elt->hasLocalName(scriptTag) || elt->hasLocalName(styleTag) ||
434 elt->hasLocalName(metaTag) || elt->hasLocalName(linkTag) ||
435 elt->hasLocalName(objectTag) || elt->hasLocalName(embedTag) ||
436 elt->hasLocalName(titleTag) || elt->hasLocalName(isindexTag) ||
437 elt->hasLocalName(baseTag))) {
439 head = new HTMLHeadElementImpl(document);
445 if (n->isTextNode()) {
446 TextImpl *t = static_cast<TextImpl *>(n);
447 if (t->containsOnlyWhitespace())
451 e = new HTMLBodyElementImpl(document);
457 } else if (h->hasLocalName(headTag)) {
458 if (n->hasTagName(htmlTag))
461 // This means the body starts here...
463 popBlock(currentTagName);
464 e = new HTMLBodyElementImpl(document);
470 } else if (h->hasLocalName(addressTag) || h->hasLocalName(dlTag) || h->hasLocalName(dtTag)
471 || h->hasLocalName(fontTag) || h->hasLocalName(titleTag)) {
472 popBlock(currentTagName);
474 } else if (h->hasLocalName(captionTag)) {
475 // Illegal content in a caption. Close the caption and try again.
476 popBlock(currentTagName);
478 return insertNode(n, flat);
479 } else if (h->hasLocalName(tableTag) || h->hasLocalName(trTag) || isTableSection(h)) {
480 if (n->hasTagName(tableTag)) {
481 popBlock(localName); // end the table
482 handled = true; // ...and start a new one
484 bool possiblyMoveStrayContent = true;
485 ExceptionCode ec = 0;
486 if (n->isTextNode()) {
487 TextImpl *t = static_cast<TextImpl *>(n);
488 if (t->containsOnlyWhitespace())
490 DOMStringImpl *i = t->string();
491 unsigned int pos = 0;
492 while (pos < i->length() && ((*i)[pos] == ' ' || (*i)[pos] == QChar(0xa0)))
494 if (pos == i->length())
495 possiblyMoveStrayContent = false;
497 if (possiblyMoveStrayContent) {
498 NodeImpl *node = current;
499 NodeImpl *parent = node->parentNode();
500 NodeImpl *grandparent = parent->parentNode();
502 if (n->isTextNode() ||
503 (h->hasLocalName(trTag) &&
504 isTableSection(parent) && grandparent->hasTagName(tableTag)) ||
505 ((!n->hasTagName(tdTag) && !n->hasTagName(thTag) &&
506 !n->hasTagName(formTag) && !n->hasTagName(scriptTag)) && isTableSection(node) &&
507 parent->hasTagName(tableTag))) {
508 node = (node->hasTagName(tableTag)) ? node :
509 ((node->hasTagName(trTag)) ? grandparent : parent);
510 NodeImpl *parent = node->parentNode();
511 parent->insertBefore(n, node, ec);
513 if (n->isHTMLElement() && tagPriority > 0 &&
514 !flat && static_cast<HTMLElementImpl*>(n)->endTagRequirement() != TagStatusForbidden)
516 pushBlock(localName, tagPriority);
518 inStrayTableContent++;
519 blockStack->strayTableContent = true;
526 if (current->hasTagName(trTag))
527 e = new HTMLTableCellElementImpl(tdTag, document);
528 else if (current->hasTagName(tableTag))
529 e = new HTMLTableSectionElementImpl(tbodyTag, document, true); // implicit
531 e = new HTMLTableRowElementImpl(document);
538 } else if (h->hasLocalName(objectTag)) {
539 setSkipMode(objectTag);
541 } else if (h->hasLocalName(ulTag) || h->hasLocalName(olTag) ||
542 h->hasLocalName(dirTag) || h->hasLocalName(menuTag)) {
543 e = new HTMLDivElementImpl(document);
546 } else if (h->hasLocalName(selectTag)) {
549 } else if (h->hasLocalName(pTag) || isHeaderTag(currentTagName)) {
551 popBlock(currentTagName);
554 } else if (h->hasLocalName(optionTag) || h->hasLocalName(optgroupTag)) {
555 if (localName == optgroupTag) {
556 popBlock(currentTagName);
558 } else if (localName == selectTag) {
559 // IE treats a nested select as </select>. Let's do the same
562 } else if (h->hasLocalName(colgroupTag)) {
563 if (!n->isTextNode()) {
564 popBlock(currentTagName);
567 } else if (!h->hasLocalName(bodyTag)) {
568 if (isInline(current)) {
573 } else if (current->isDocumentNode()) {
574 if (current->firstChild() == 0) {
575 e = new HTMLHtmlElementImpl(document);
581 // 3. If we couldn't handle the error, just return false and attempt to error-correct again.
584 return insertNode(n);
587 typedef bool (HTMLParser::*CreateErrorCheckFunc)(Token* t, RefPtr<NodeImpl>&);
588 typedef HashMap<AtomicStringImpl*, CreateErrorCheckFunc> FunctionMap;
590 bool HTMLParser::textCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
592 result = new TextImpl(document, t->text.get());
596 bool HTMLParser::commentCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
598 result = new CommentImpl(document, t->text.get());
602 bool HTMLParser::headCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
604 if (!head || current->localName() == htmlTag) {
605 head = new HTMLHeadElementImpl(document);
611 bool HTMLParser::bodyCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
613 // body no longer allowed if we have a frameset
621 bool HTMLParser::framesetCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
624 if (inBody && !haveFrameSet && !haveContent) {
626 // ### actually for IE document.body returns the now hidden "body" element
627 // we can't implement that behaviour now because it could cause too many
628 // regressions and the headaches are not worth the work as long as there is
629 // no site actually relying on that detail (Dirk)
631 doc()->body()->setAttribute(styleAttr, "display:none");
634 if ((haveContent || haveFrameSet) && current->localName() == htmlTag)
641 bool HTMLParser::iframeCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
643 // a bit of a special case, since the frame is inlined
644 setSkipMode(iframeTag);
648 bool HTMLParser::formCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
650 // Only create a new form if we're not already inside one.
651 // This is consistent with other browsers' behavior.
653 form = new HTMLFormElementImpl(document);
659 bool HTMLParser::isindexCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
661 NodeImpl *n = handleIsindex(t);
671 bool HTMLParser::selectCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
677 bool HTMLParser::ddCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
684 bool HTMLParser::dtCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
691 bool HTMLParser::nestedCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
693 popBlock(t->tagName);
697 bool HTMLParser::nestedStyleCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
699 return allowNestedRedundantTag(t->tagName);
702 bool HTMLParser::tableCellCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
709 bool HTMLParser::tableSectionCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
717 bool HTMLParser::noembedCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
719 setSkipMode(noembedTag);
723 bool HTMLParser::noframesCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
725 setSkipMode(noframesTag);
729 bool HTMLParser::noscriptCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
731 if (!m_fragment && document->frame()->jScriptEnabled())
732 setSkipMode(noscriptTag);
736 bool HTMLParser::mapCreateErrorCheck(Token* t, RefPtr<NodeImpl>& result)
738 map = new HTMLMapElementImpl(document);
// Creates the DOM node for token t and returns it.
// Tags that need special handling are dispatched through a static table that
// maps the tag name's AtomicStringImpl to an HTMLParser "create error check"
// member function. The table is filled on the first call, while it is still
// empty. The check receives the token and the result slot, and its return
// value is stored in |proceed|. HTMLElementFactory::createHTMLElement() builds
// the node generically from the tag name, doc() and the current form.
// NOTE(review): the declaration of |proceed| and the condition guarding the
// factory call are not visible in this listing -- presumably the factory only
// runs when |proceed| is true; confirm against the full file.
743 PassRefPtr<NodeImpl> HTMLParser::getNode(Token* t)
745 // Init our error handling table.
746 static FunctionMap gFunctionMap;
747 if (gFunctionMap.isEmpty()) {
// Text and comment tokens are keyed by their atoms directly.
748 gFunctionMap.set(textAtom.impl(), &HTMLParser::textCreateErrorCheck);
749 gFunctionMap.set(commentAtom.impl(), &HTMLParser::commentCreateErrorCheck);
// Element tags are keyed by the impl of their local name.
750 gFunctionMap.set(headTag.localName().impl(), &HTMLParser::headCreateErrorCheck);
751 gFunctionMap.set(bodyTag.localName().impl(), &HTMLParser::bodyCreateErrorCheck);
752 gFunctionMap.set(framesetTag.localName().impl(), &HTMLParser::framesetCreateErrorCheck);
753 gFunctionMap.set(iframeTag.localName().impl(), &HTMLParser::iframeCreateErrorCheck);
754 gFunctionMap.set(formTag.localName().impl(), &HTMLParser::formCreateErrorCheck);
755 gFunctionMap.set(isindexTag.localName().impl(), &HTMLParser::isindexCreateErrorCheck);
756 gFunctionMap.set(selectTag.localName().impl(), &HTMLParser::selectCreateErrorCheck);
757 gFunctionMap.set(ddTag.localName().impl(), &HTMLParser::ddCreateErrorCheck);
758 gFunctionMap.set(dtTag.localName().impl(), &HTMLParser::dtCreateErrorCheck);
// li, a, button, nobr, wbr and tr all share nestedCreateErrorCheck.
759 gFunctionMap.set(liTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
760 gFunctionMap.set(aTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
761 gFunctionMap.set(buttonTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
762 gFunctionMap.set(nobrTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
763 gFunctionMap.set(wbrTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
764 gFunctionMap.set(trTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck);
// Table cells and table sections each share one check.
765 gFunctionMap.set(tdTag.localName().impl(), &HTMLParser::tableCellCreateErrorCheck);
766 gFunctionMap.set(thTag.localName().impl(), &HTMLParser::tableCellCreateErrorCheck);
767 gFunctionMap.set(tbodyTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck);
768 gFunctionMap.set(theadTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck);
769 gFunctionMap.set(tfootTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck);
// tt, u, b, i, s, strike, big and small all share nestedStyleCreateErrorCheck.
770 gFunctionMap.set(ttTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
771 gFunctionMap.set(uTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
772 gFunctionMap.set(bTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
773 gFunctionMap.set(iTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
774 gFunctionMap.set(sTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
775 gFunctionMap.set(strikeTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
776 gFunctionMap.set(bigTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
777 gFunctionMap.set(smallTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck);
778 gFunctionMap.set(noembedTag.localName().impl(), &HTMLParser::noembedCreateErrorCheck);
779 gFunctionMap.set(noframesTag.localName().impl(), &HTMLParser::noframesCreateErrorCheck);
780 gFunctionMap.set(noscriptTag.localName().impl(), &HTMLParser::noscriptCreateErrorCheck);
781 gFunctionMap.set(mapTag.localName().impl(), &HTMLParser::mapCreateErrorCheck);
785 RefPtr<NodeImpl> result;
// Run the tag-specific check when the token's tag name has a table entry.
786 if (CreateErrorCheckFunc errorCheckFunc = gFunctionMap.get(t->tagName.impl()))
787 proceed = (this->*errorCheckFunc)(t, result);
789 result = HTMLElementFactory::createHTMLElement(t->tagName, doc(), form);
// release() hands the node over to the returned PassRefPtr.
790 return result.release();
793 #define MAX_REDUNDANT 20
// Decides whether another nested tag named _tagName may be opened.
// Counts consecutive entries, starting at the top of blockStack, whose tagName
// equals _tagName; counting stops at MAX_REDUNDANT. Returns false only when
// that cap was reached.
// NOTE(review): the declaration of |i| is not visible in this listing.
795 bool HTMLParser::allowNestedRedundantTag(const AtomicString& _tagName)
797 // www.liceo.edu.mx is an example of a site that achieves a level of nesting of
798 // about 1500 tags, all from a bunch of <b>s. We will only allow at most 20
799 // nested tags of the same type before just ignoring them altogether.
// The loop has no body (note the trailing ';'): all of the work happens in
// the loop header.
801 for (HTMLStackElem* curr = blockStack;
802 i < MAX_REDUNDANT && curr && curr->tagName == _tagName;
803 curr = curr->next, i++);
804 return i != MAX_REDUNDANT;
// Handles a close-tag token.
// </html> and </body> get special treatment (see the comment below). </form>,
// </map> and </select> each have their own branch. After that the matching
// block is popped with popBlock(); if the top of the block stack did not
// change and the tag is </p>, the stray-</p> path runs and ends with a second
// popBlock().
// NOTE(review): the statements guarded by the html/body, form, map and select
// tests, and most of the stray-</p> branch, are not visible in this listing;
// confirm their behavior against the full file.
807 void HTMLParser::processCloseTag(Token *t)
809 // Support for really broken html.
810 // we never close the body tag, since some stupid web pages close it before the actual end of the doc.
811 // let's rely on the end() call to close things.
812 if (t->tagName == htmlTag || t->tagName == bodyTag)
815 if (t->tagName == formTag)
817 else if (t->tagName == mapTag)
819 else if (t->tagName == selectTag)
// Remember the stack top so we can tell whether popBlock() removed anything.
822 HTMLStackElem* oldElem = blockStack;
823 popBlock(t->tagName);
824 if (oldElem == blockStack && t->tagName == pTag) {
825 // We encountered a stray </p>. Amazingly Gecko, WinIE, and MacIE all treat
826 // this as a valid break, i.e., <p></p>. So go ahead and make the empty
830 popBlock(t->tagName);
// Returns true when tagName is one of h1 through h6.
// The lookup set stores the tags' AtomicStringImpl pointers and is populated
// on the first call, while it is still empty.
834 bool HTMLParser::isHeaderTag(const AtomicString& tagName)
836 static HashSet<AtomicStringImpl*> headerTags;
837 if (headerTags.isEmpty()) {
838 headerTags.add(h1Tag.localName().impl());
839 headerTags.add(h2Tag.localName().impl());
840 headerTags.add(h3Tag.localName().impl());
841 headerTags.add(h4Tag.localName().impl());
842 headerTags.add(h5Tag.localName().impl());
843 headerTags.add(h6Tag.localName().impl());
// Membership is tested on the impl pointer, matching how entries were added.
846 return headerTags.contains(tagName.impl());
// Walks the block stack from the top looking for an open header tag and pops
// it with popBlock() when one is found. currNode starts at |current| and is
// advanced to curr->node after each stack entry; it is what the !isInline()
// test inspects.
// NOTE(review): the statements that follow popBlock() and the !isInline()
// test are not visible in this listing -- presumably both end the walk;
// confirm against the full file.
849 void HTMLParser::popNestedHeaderTag()
851 // This function only cares about checking for nested headers that have only inlines in between them.
852 NodeImpl* currNode = current;
853 for (HTMLStackElem* curr = blockStack; curr; curr = curr->next) {
854 if (isHeaderTag(curr->tagName)) {
855 popBlock(curr->tagName);
858 if (currNode && !isInline(currNode))
860 currNode = curr->node.get();
// Classifies node as inline for the parser's error handling.
// Text nodes are tested first, then HTML elements against the tag list below.
// <noscript> only matches when m_fragment is false and
// document->frame()->jScriptEnabled() is true.
// NOTE(review): the return statements are not visible in this listing --
// presumably true for text nodes and listed elements, false otherwise;
// confirm against the full file.
864 bool HTMLParser::isInline(NodeImpl* node) const
866 if (node->isTextNode())
869 if (node->isHTMLElement()) {
870 HTMLElementImpl* e = static_cast<HTMLElementImpl*>(node);
871 if (e->hasLocalName(aTag) || e->hasLocalName(fontTag) || e->hasLocalName(ttTag) ||
872 e->hasLocalName(uTag) || e->hasLocalName(bTag) || e->hasLocalName(iTag) ||
873 e->hasLocalName(sTag) || e->hasLocalName(strikeTag) || e->hasLocalName(bigTag) ||
874 e->hasLocalName(smallTag) || e->hasLocalName(emTag) || e->hasLocalName(strongTag) ||
875 e->hasLocalName(dfnTag) || e->hasLocalName(codeTag) || e->hasLocalName(sampTag) ||
876 e->hasLocalName(kbdTag) || e->hasLocalName(varTag) || e->hasLocalName(citeTag) ||
877 e->hasLocalName(abbrTag) || e->hasLocalName(acronymTag) || e->hasLocalName(subTag) ||
878 e->hasLocalName(supTag) || e->hasLocalName(spanTag) || e->hasLocalName(nobrTag) ||
879 e->hasLocalName(wbrTag) || e->hasLocalName(noframesTag) || e->hasLocalName(nolayerTag) ||
880 e->hasLocalName(noembedTag) || (e->hasLocalName(noscriptTag) && !m_fragment && document->frame()->jScriptEnabled()))
// Returns true when tagName is in the set of residual style tags registered
// below: a, font, tt, u, b, i, s, strike, big, small, em, strong, dfn, code,
// samp, kbd, var, nobr and wbr.
// The set stores AtomicStringImpl pointers and is populated on the first
// call, while it is still empty.
887 bool HTMLParser::isResidualStyleTag(const AtomicString& tagName)
889 static HashSet<AtomicStringImpl*> residualStyleTags;
890 if (residualStyleTags.isEmpty()) {
891 residualStyleTags.add(aTag.localName().impl());
892 residualStyleTags.add(fontTag.localName().impl());
893 residualStyleTags.add(ttTag.localName().impl());
894 residualStyleTags.add(uTag.localName().impl());
895 residualStyleTags.add(bTag.localName().impl());
896 residualStyleTags.add(iTag.localName().impl());
897 residualStyleTags.add(sTag.localName().impl());
898 residualStyleTags.add(strikeTag.localName().impl());
899 residualStyleTags.add(bigTag.localName().impl());
900 residualStyleTags.add(smallTag.localName().impl());
901 residualStyleTags.add(emTag.localName().impl());
902 residualStyleTags.add(strongTag.localName().impl());
903 residualStyleTags.add(dfnTag.localName().impl());
904 residualStyleTags.add(codeTag.localName().impl());
905 residualStyleTags.add(sampTag.localName().impl());
906 residualStyleTags.add(kbdTag.localName().impl());
907 residualStyleTags.add(varTag.localName().impl());
908 residualStyleTags.add(nobrTag.localName().impl());
909 residualStyleTags.add(wbrTag.localName().impl());
// Membership is tested on the impl pointer, matching how entries were added.
912 return residualStyleTags.contains(tagName.impl());
// Membership test used by handleResidualStyleCloseTagAcrossBlocks().
// Residual style tags are tested first via isResidualStyleTag(); otherwise
// tagName is looked up in the set of block-level tags registered below
// (h1-h6, p, div, blockquote, address, center, ul, ol, li, dl, dt, dd, pre,
// form). That set is populated on the first call, while it is still empty.
// NOTE(review): the statement guarded by the isResidualStyleTag() test is not
// visible in this listing -- presumably `return true`; confirm.
915 bool HTMLParser::isAffectedByResidualStyle(const AtomicString& tagName)
917 if (isResidualStyleTag(tagName))
920 static HashSet<AtomicStringImpl*> affectedBlockTags;
921 if (affectedBlockTags.isEmpty()) {
922 affectedBlockTags.add(h1Tag.localName().impl());
923 affectedBlockTags.add(h2Tag.localName().impl());
924 affectedBlockTags.add(h3Tag.localName().impl());
925 affectedBlockTags.add(h4Tag.localName().impl());
926 affectedBlockTags.add(h5Tag.localName().impl());
927 affectedBlockTags.add(h6Tag.localName().impl());
928 affectedBlockTags.add(pTag.localName().impl());
929 affectedBlockTags.add(divTag.localName().impl());
930 affectedBlockTags.add(blockquoteTag.localName().impl());
931 affectedBlockTags.add(addressTag.localName().impl());
932 affectedBlockTags.add(centerTag.localName().impl());
933 affectedBlockTags.add(ulTag.localName().impl());
934 affectedBlockTags.add(olTag.localName().impl());
935 affectedBlockTags.add(liTag.localName().impl());
936 affectedBlockTags.add(dlTag.localName().impl());
937 affectedBlockTags.add(dtTag.localName().impl());
938 affectedBlockTags.add(ddTag.localName().impl());
939 affectedBlockTags.add(preTag.localName().impl());
940 affectedBlockTags.add(formTag.localName().impl());
943 return affectedBlockTags.contains(tagName.impl());
946 void HTMLParser::handleResidualStyleCloseTagAcrossBlocks(HTMLStackElem* elem)
948 // Find the element that crosses over to a higher level. For now, if there is more than
949 // one, we will just give up and not attempt any sort of correction. It's highly unlikely that
950 // there will be more than one, since <p> tags aren't allowed to be nested.
951 ExceptionCode ec = 0;
952 HTMLStackElem* curr = blockStack;
953 HTMLStackElem* maxElem = 0;
954 HTMLStackElem* prev = 0;
955 HTMLStackElem* prevMaxElem = 0;
956 while (curr && curr != elem) {
957 if (curr->level > elem->level) {
968 if (!curr || !maxElem || !isAffectedByResidualStyle(maxElem->tagName)) return;
970 NodeImpl* residualElem = prev->node.get();
971 NodeImpl* blockElem = prevMaxElem ? prevMaxElem->node.get() : current;
972 NodeImpl* parentElem = elem->node.get();
974 // Check to see if the reparenting that is going to occur is allowed according to the DOM.
975 // FIXME: We should either always allow it or perform an additional fixup instead of
976 // just bailing here.
977 // Example: <p><font><center>blah</font></center></p> isn't doing a fixup right now.
978 if (!parentElem->childAllowed(blockElem))
981 if (maxElem->node->parentNode() != elem->node) {
982 // Walk the stack and remove any elements that aren't residual style tags. These
983 // are basically just being closed up. Example:
984 // <font><span>Moo<p>Goo</font></p>.
985 // In the above example, the <span> doesn't need to be reopened. It can just close.
986 HTMLStackElem* currElem = maxElem->next;
987 HTMLStackElem* prevElem = maxElem;
988 while (currElem != elem) {
989 HTMLStackElem* nextElem = currElem->next;
990 if (!isResidualStyleTag(currElem->tagName)) {
991 prevElem->next = nextElem;
992 prevElem->node = currElem->node;
1000 // We have to reopen residual tags in between maxElem and elem. An example of this case is:
1001 // <font><i>Moo<p>Foo</font>.
1002 // In this case, we need to transform the part before the <p> into:
1003 // <font><i>Moo</i></font><i>
1004 // so that the <i> will remain open. This involves the modification of elements
1005 // in the block stack.
1006 // This will also affect how we ultimately reparent the block, since we want it to end up
1007 // under the reopened residual tags (e.g., the <i> in the above example.)
1008 NodeImpl* prevNode = 0;
1010 while (currElem->node != residualElem) {
1011 if (isResidualStyleTag(currElem->node->localName())) {
1012 // Create a clone of this element.
1013 RefPtr<NodeImpl> currNode = currElem->node->cloneNode(false);
1015 // Change the stack element's node to point to the clone.
1016 currElem->node = currNode;
1018 // Attach the previous node as a child of this new node.
1020 currNode->appendChild(prevNode, ec);
1021 else // The new parent for the block element is going to be the innermost clone.
1022 parentElem = currNode.get();
1024 prevNode = currNode.get();
1027 currElem = currElem->next;
1030 // Now append the chain of new residual style elements if one exists.
1032 elem->node->appendChild(prevNode, ec);
1035 // Check if the block is still in the tree. If it isn't, then we don't
1036 // want to remove it from its parent (that would crash) or insert it into
1037 // a new parent later. See http://bugzilla.opendarwin.org/show_bug.cgi?id=6778
1038 bool isBlockStillInTree = blockElem->parentNode();
1040 // We need to make a clone of |residualElem| and place it just inside |blockElem|.
1041 // All content of |blockElem| is reparented to be under this clone. We then
1042 // reparent |blockElem| using real DOM calls so that attachment/detachment will
1043 // be performed to fix up the rendering tree.
1044 // So for this example: <b>...<p>Foo</b>Goo</p>
1045 // The end result will be: <b>...</b><p><b>Foo</b>Goo</p>
1047 // Step 1: Remove |blockElem| from its parent, doing a batch detach of all the kids.
1048 if (isBlockStillInTree)
1049 blockElem->parentNode()->removeChild(blockElem, ec);
1051 // Step 2: Clone |residualElem|.
1052 RefPtr<NodeImpl> newNode = residualElem->cloneNode(false); // Shallow clone. We don't pick up the same kids.
1054 // Step 3: Place |blockElem|'s children under |newNode|. Remove all of the children of |blockElem|
1055 // before we've put |newNode| into the document. That way we'll only do one attachment of all
1056 // the new content (instead of a bunch of individual attachments).
1057 NodeImpl* currNode = blockElem->firstChild();
1059 NodeImpl* nextNode = currNode->nextSibling();
1060 newNode->appendChild(currNode, ec);
1061 currNode = nextNode;
1064 // Step 4: Place |newNode| under |blockElem|. |blockElem| is still out of the document, so no
1065 // attachment can occur yet.
1066 blockElem->appendChild(newNode.release(), ec);
1068 // Step 5: Reparent |blockElem|. Now the full attachment of the fixed up tree takes place.
1069 if (isBlockStillInTree)
1070 parentElem->appendChild(blockElem, ec);
1072 // Step 6: Elide |elem|, since it is effectively no longer open. Also update
1073 // the node associated with the previous stack element so that when it gets popped,
1074 // it doesn't make the residual element the next current node.
1075 HTMLStackElem* currElem = maxElem;
1076 HTMLStackElem* prevElem = 0;
1077 while (currElem != elem) {
1078 prevElem = currElem;
1079 currElem = currElem->next;
1081 prevElem->next = elem->next;
1082 prevElem->node = elem->node;
1085 // Step 7: Reopen intermediate inlines, e.g., <b><p><i>Foo</b>Goo</p>.
1086 // In the above example, Goo should stay italic.
1088 HTMLStackElem* residualStyleStack = 0;
1089 while (curr && curr != maxElem) {
1090 // We will actually schedule this tag for reopening
1091 // after we complete the close of this entire block.
1092 NodeImpl* currNode = current;
1093 if (isResidualStyleTag(curr->tagName)) {
1094 // We've overloaded the use of stack elements and are just reusing the
1095 // struct with a slightly different meaning to the variables. Instead of chaining
1096 // from innermost to outermost, we build up a list of all the tags we need to reopen
1097 // from the outermost to the innermost, i.e., residualStyleStack will end up pointing
1098 // to the outermost tag we need to reopen.
1099 // We also set curr->node to be the actual element that corresponds to the tag name stored in
1100 // curr->tagName rather than the node that you should pop to when the element gets pulled off
1103 curr->node = currNode;
1104 curr->next = residualStyleStack;
1105 residualStyleStack = curr;
1113 reopenResidualStyleTags(residualStyleStack, 0); // FIXME: Deal with stray table content some day
1114 // if it becomes necessary to do so.
// Reopens the residual style tags recorded in the list headed by |elem|. For each
// entry a shallow clone of the entry's node is inserted into the tree, a new block
// is pushed for it via pushBlock(), and |current| is moved to the clone.
//   elem                 - first entry of the list of tags to reopen; entries are
//                          chained through their |next| pointers.
//   malformedTableParent - when non-null, the clone is inserted into this node just
//                          before its last child; the variable is reset to 0 further
//                          down once it has been used.
// NOTE(review): short lines (braces, the loop header, an apparent `else`, and the tail
// of the loop body) are not visible in this chunk; confirm the exact control flow
// against the full file.
1117 void HTMLParser::reopenResidualStyleTags(HTMLStackElem* elem, NodeImpl* malformedTableParent)
1119 // Loop for each tag that needs to be reopened.
1121 // Create a shallow clone of the DOM node for this element.
1122 RefPtr<NodeImpl> newNode = elem->node->cloneNode(false);
1124 // Append the new node. In the malformed table case, we need to insert before the table,
1125 // which will be the last child.
1126 ExceptionCode ec = 0;
1127 if (malformedTableParent)
1128 malformedTableParent->insertBefore(newNode, malformedTableParent->lastChild(), ec);
// NOTE(review): presumably the append below is the `else` branch of the test above -- confirm.
1130 current->appendChild(newNode, ec);
1131 // FIXME: Is it really OK to ignore the exceptions here?
1133 // Now push a new stack element for this node we just created.
1134 pushBlock(elem->tagName, elem->level);
1136 // Set our strayTableContent boolean if needed, so that the reopened tag also knows
1137 // that it is inside a malformed table.
// inStrayTableContent is incremented to match; popOneBlock() decrements it again when
// an entry flagged with strayTableContent is popped.
1138 blockStack->strayTableContent = malformedTableParent != 0;
1139 if (blockStack->strayTableContent)
1140 inStrayTableContent++;
1142 // Clear our malformed table parent variable.
// Once it is 0, the table-specific insertion and flagging above no longer apply.
1143 malformedTableParent = 0;
1145 // Update |current| manually to point to the new node.
1146 setCurrent(newNode.get());
1148 // Advance to the next tag that needs to be reopened.
1149 HTMLStackElem* next = elem->next;
// Allocates a new HTMLStackElem for |tagName| at nesting level |_level|, handing the
// constructor the present |current| node and the present |blockStack| head.
// NOTE(review): the statement that stores the new element (presumably as the new
// |blockStack| head) is not visible in this chunk -- confirm against the full file.
1155 void HTMLParser::pushBlock(const AtomicString& tagName, int _level)
1157 HTMLStackElem *Elem = new HTMLStackElem(tagName, _level, current, blockStack);
// Handles the close of the element named |_tagName| on the block stack.
// Visible steps:
//   1. Search the stack from the top for an entry named |_tagName|, tracking the
//      highest |level| among the entries passed over.
//   2. If a passed-over entry has a higher level than the match, the close tag crosses
//      a block boundary; residual style tags are handed to
//      handleResidualStyleCloseTagAcrossBlocks().
//   3. Otherwise entries are processed up to the match; residual style tags found on
//      the way are collected in |residualStyleStack| and reopened at the end.
// NOTE(review): short lines (braces, `return`s, `else`s, loop headers and the
// popOneBlock() calls) are not visible in this chunk; confirm details in the full file.
1161 void HTMLParser::popBlock(const AtomicString& _tagName)
1163 HTMLStackElem *Elem = blockStack;
// Walk down the stack until an entry with a matching tag name is found (or the stack
// runs out), remembering the largest level seen among the non-matching entries.
1167 while (Elem && (Elem->tagName != _tagName)) {
1168 if (maxLevel < Elem->level)
1169 maxLevel = Elem->level;
1176 if (maxLevel > Elem->level) {
1177 // We didn't match because the tag is in a different scope, e.g.,
1178 // <b><p>Foo</b>. Try to correct the problem.
1179 if (!isResidualStyleTag(_tagName))
1181 return handleResidualStyleCloseTagAcrossBlocks(Elem);
// Whether the element being closed is subject to residual style handling; this gates
// the collection of tags to reopen below.
1184 bool isAffectedByStyle = isAffectedByResidualStyle(Elem->tagName);
1185 HTMLStackElem* residualStyleStack = 0;
1186 NodeImpl* malformedTableParent = 0;
1190 if (Elem->tagName == _tagName) {
// Snapshot the counter so a drop in inStrayTableContent can be detected afterwards.
1191 int strayTable = inStrayTableContent;
1195 // This element was the root of some malformed content just inside an implicit or
1196 // explicit <tbody> or <tr>.
1197 // If we end up needing to reopen residual style tags, the root of the reopened chain
1198 // must also know that it is the root of malformed content inside a <tbody>/<tr>.
1199 if (strayTable && (inStrayTableContent < strayTable) && residualStyleStack) {
// Walk up from |current| to the nearest ancestor with the table tag; that node's
// parent (or 0 when no such ancestor exists) becomes |malformedTableParent|.
1200 NodeImpl* curr = current;
1201 while (curr && !curr->hasTagName(tableTag))
1202 curr = curr->parentNode();
1203 malformedTableParent = curr ? curr->parentNode() : 0;
1207 if (form && Elem->tagName == formTag)
1208 // A <form> is being closed prematurely (and this is
1209 // malformed HTML). Set an attribute on the form to clear out its
1211 form->setMalformed(true);
1213 // Schedule this tag for reopening
1214 // after we complete the close of this entire block.
1215 NodeImpl* currNode = current;
1216 if (isAffectedByStyle && isResidualStyleTag(Elem->tagName)) {
1217 // We've overloaded the use of stack elements and are just reusing the
1218 // struct with a slightly different meaning to the variables. Instead of chaining
1219 // from innermost to outermost, we build up a list of all the tags we need to reopen
1220 // from the outermost to the innermost, i.e., residualStyleStack will end up pointing
1221 // to the outermost tag we need to reopen.
1222 // We also set Elem->node to be the actual element that corresponds to the tag name stored in
1223 // Elem->tagName rather than the node that you should pop to when the element gets pulled off
1226 Elem->next = residualStyleStack;
1227 Elem->node = currNode;
1228 residualStyleStack = Elem;
// Reopen every residual style tag collected above; |malformedTableParent| is non-null
// only when it was set in the stray table content case.
1236 reopenResidualStyleTags(residualStyleStack, malformedTableParent);
// Pops the top entry of |blockStack|: unlinks it, makes the node saved in the entry
// the new |current|, and decrements inStrayTableContent when the entry was flagged
// with strayTableContent. Before that, if |current| is set and differs from the
// entry's saved node, |current| is finalized (state registration/restoration and
// closeRenderer()).
//   delBlock - not referenced in the lines visible here; presumably controls whether
//              the popped entry is deleted -- confirm against the full file.
// NOTE(review): short lines (braces, the guard for an empty stack) are not visible in this chunk.
1239 void HTMLParser::popOneBlock(bool delBlock)
1241 HTMLStackElem *Elem = blockStack;
1243 // we should never get here, but some bad html might cause it.
1246 if (current && Elem->node != current) {
// Nodes that maintain state are registered with the document and given any saved
// state the document returns, when that list is non-empty.
1247 if (current->maintainsState() && doc()) {
1248 doc()->registerMaintainsState(current);
1249 QStringList &states = doc()->restoreState();
1250 if (!states.isEmpty())
1251 current->restoreState(states);
1254 // A few elements (<applet>, <object>) need to know when all child elements (<param>s) are available:
1255 current->closeRenderer();
// Unlink the entry and make the node it saved the current node.
1258 blockStack = Elem->next;
1259 setCurrent(Elem->node.get());
// Balances the increment done when an entry is flagged as stray table content.
1261 if (Elem->strayTableContent)
1262 inStrayTableContent--;
// Loops while the block stack is non-empty and isInline(current) holds.
// NOTE(review): the loop body is not visible in this chunk; presumably it pops one
// block per iteration -- confirm against the full file.
1268 void HTMLParser::popInlineBlocks()
1270 while (blockStack && isInline(current))
// NOTE(review): the body of this function is not visible in this chunk. Presumably it
// releases whatever entries remain on the block stack -- confirm against the full file.
1274 void HTMLParser::freeBlock()
// Creates the HTMLHeadElementImpl for the document and inserts it as a child of the
// document's first child, placed before the node returned by doc()->body().
// The guard below tests for an already-set |head| or a document without a first child.
// NOTE(review): the statement controlled by the guard (presumably an early return) and
// any handling of |ec| after the insert are not visible in this chunk.
1280 void HTMLParser::createHead()
1282 if (head || !doc()->firstChild())
1285 head = new HTMLHeadElementImpl(document);
1286 HTMLElementImpl* body = doc()->body();
1287 ExceptionCode ec = 0;
1288 doc()->firstChild()->insertBefore(head, body, ec);
// Builds the DOM subtree for an <isindex> token |t|: a <div> whose children are, in
// order, an <hr>, a text node with the prompt, the HTMLIsIndexElementImpl itself, and
// a closing <hr>.
//   t - the token; its attribute map is handed to the isindex element, and its
//       prompt attribute (when present) replaces the default prompt text.
// NOTE(review): the return statement is not visible in this chunk; presumably the
// <div> |n| is returned -- confirm against the full file.
1293 NodeImpl* HTMLParser::handleIsindex(Token* t)
1295 NodeImpl* n = new HTMLDivElementImpl(document);
1297 NamedMappedAttrMapImpl* attrs = t->attrs.get();
// The isindex element is created with the parser's current |form| and takes over the
// token's attributes, with its type attribute set to "khtml_isindex".
1299 RefPtr<HTMLIsIndexElementImpl> isIndex = new HTMLIsIndexElementImpl(document, form);
1300 isIndex->setAttributeMap(attrs);
1301 isIndex->setAttribute(typeAttr, "khtml_isindex");
// Default prompt text; overridden below by the prompt attribute's value plus a trailing space.
1303 DOMString text = searchableIndexIntroduction();
// NOTE(review): |attrs| is dereferenced here; confirm that a null check guards this
// in the full file (none is visible in this chunk).
1305 if (AttributeImpl *a = attrs->getAttributeItem(promptAttr))
1306 text = a->value().domString() + " ";
1310 n->addChild(new HTMLHRElementImpl(document));
1311 n->addChild(new TextImpl(document, text));
1312 n->addChild(isIndex.get());
1313 n->addChild(new HTMLHRElementImpl(document));
// Inserts the stored |isindex| node into the tree via insertNode().
// NOTE(review): most of this function (any guards and state updates around the insert)
// is not visible in this chunk -- confirm against the full file.
1318 void HTMLParser::startBody()
1325 insertNode(isindex.get(), true /* don't descend into this node */);
// Finishes parsing: creates the root HTML element when the document is still
// completely empty, then notifies the document through finishedParsing().
// NOTE(review): the statements belonging to the "current" cleanup comment below, and
// any condition guarding the final call, are not visible in this chunk.
1330 void HTMLParser::finished()
1332 // In the case of a completely empty document, here's the place to create the HTML element.
1333 if (current && current->isDocumentNode() && !current->firstChild())
1334 insertNode(new HTMLHtmlElementImpl(document));
1336 // This ensures that "current" is not left pointing to a node when the document is destroyed.
1340 // Warning, this may delete the tokenizer and parser, so don't try to do anything else after this.
1342 document->finishedParsing();