2 * This file is part of the DOM implementation for KDE.
4 * Copyright (C) 2000 Peter Kelly (pmk@post.com)
5 * Copyright (C) 2005, 2006 Apple Computer, Inc.
6 * Copyright (C) 2006 Alexey Proskuryakov (ap@webkit.org)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
25 #include "XMLTokenizer.h"
27 #include "CDATASection.h"
30 #include "CachedScript.h"
32 #include "DocLoader.h"
34 #include "DocumentFragment.h"
35 #include "DocumentType.h"
36 #include "EventNames.h"
38 #include "FrameLoader.h"
39 #include "FrameView.h"
40 #include "HTMLNames.h"
41 #include "HTMLScriptElement.h"
42 #include "HTMLTableSectionElement.h"
43 #include "HTMLTokenizer.h"
44 #include "LoaderFunctions.h"
45 #include "ProcessingInstruction.h"
46 #include "ResourceHandle.h"
47 #include "ResourceRequest.h"
48 #include "ResourceResponse.h"
49 #include <libxml/parser.h>
50 #include <libxml/parserInternals.h>
51 #include <wtf/Platform.h>
52 #include <wtf/Vector.h>
55 #include <libxslt/xslt.h>
60 #include "XLinkNames.h"
67 using namespace EventNames;
68 using namespace HTMLNames;
// Cap compared against m_errorCount in XMLTokenizer::handleError before a non-fatal message is recorded.
70 const int maxErrors = 25;
// Prefix -> namespace URI table held as raw StringImpl pointers; filled by the fragment constructor.
72 typedef HashMap<StringImpl*, StringImpl*> PrefixForNamespaceMap;
// Queue of deferred SAX callbacks; the class is defined further down in this file.
74 class PendingCallbacks;
// Tokenizer that pushes document text through a libxml2 parser context (m_context) and
// builds DOM nodes from the SAX callbacks declared below. It is also a
// CachedResourceClient, so it can be told about finished resources via notifyFinished().
76 class XMLTokenizer : public Tokenizer, public CachedResourceClient {
// One constructor for a whole document, one for a fragment; the fragment constructor
// reads xmlns declarations from the Element and its ancestors.
78 XMLTokenizer(Document*, FrameView* = 0);
79 XMLTokenizer(DocumentFragment*, Element*);
// Severity values accepted by error() and handleError().
82 enum ErrorType { warning, nonFatal, fatal };
85 virtual bool write(const SegmentedString& str, bool);
86 virtual void finish();
87 virtual bool isWaitingForScripts() const;
88 virtual void stopParsing();
// Set by externalSubsetHandler when a known XHTML public identifier is seen; read by
// getEntityHandler before it falls back to the XHTML named-entity table.
95 void setIsXHTMLDocument(bool isXHTML) { m_isXHTMLDocument = isXHTML; }
96 bool isXHTMLDocument() const { return m_isXHTMLDocument; }
98 // from CachedResourceClient
99 virtual void notifyFinished(CachedResource* finishedObj);
101 // callbacks from parser SAX
// Each of these is reached from a static trampoline that recovers the tokenizer from the
// libxml context's _private field (see getTokenizer).
102 void error(ErrorType, const char* message, va_list args);
103 void startElementNs(const xmlChar* xmlLocalName, const xmlChar* xmlPrefix, const xmlChar* xmlURI, int nb_namespaces, const xmlChar** namespaces, int nb_attributes, int nb_defaulted, const xmlChar** libxmlAttributes);
105 void characters(const xmlChar* s, int len);
106 void processingInstruction(const xmlChar* target, const xmlChar* data);
107 void cdataBlock(const xmlChar* s, int len);
108 void comment(const xmlChar* s);
109 void startDocument(const xmlChar* version, const xmlChar* encoding, int standalone);
110 void internalSubset(const xmlChar* name, const xmlChar* externalID, const xmlChar* systemID);
// Records an already-formatted message; also the replay target of PendingErrorCallback.
112 void handleError(ErrorType type, const char* m, int lineNumber, int columnNumber);
115 void initializeParserContext();
// Switches m_currentNode, taking/dropping a reference for any node other than the document.
116 void setCurrentNode(Node*);
118 int lineNumber() const;
119 int columnNumber() const;
121 void insertErrorMessageBlock();
// Raw text accumulated by write() until the first element, or for the whole input once an
// XSL transform has been seen; handed to xmlDocPtrForString in end().
129 String m_originalSourceForTransform;
131 xmlParserCtxtPtr m_context;
// True when m_currentNode holds a reference that setCurrentNode must release.
133 bool m_currentNodeIsReferenced;
136 bool m_sawXSLTransform;
137 bool m_sawFirstElement;
138 bool m_isXHTMLDocument;
141 bool m_requestingScript;
// Position of the most recently recorded error, used by handleError to suppress repeats.
146 int m_lastErrorColumn;
147 String m_errorMessages;
// External script requested from endElementNs and the element that requested it.
149 CachedScript* m_pendingScript;
150 RefPtr<Element> m_scriptElement;
151 int m_scriptStartLine;
// Fragment-parsing state: namespaces inherited from the context element's ancestors.
153 bool m_parsingFragment;
154 String m_defaultNamespaceURI;
155 PrefixForNamespaceMap m_prefixToNamespaceMap;
// SAX callbacks and source text queued while m_parserPaused is set.
157 PendingCallbacks* m_pendingCallbacks;
158 SegmentedString m_pendingSrc;
// FIFO of SAX callbacks captured while the tokenizer is paused. Every append* method
// deep-copies its libxml-owned arguments into a heap-allocated Pending*Callback and queues
// it; callAndRemoveFirstCallback() replays the oldest entry against a tokenizer.
161 class PendingCallbacks {
// The list owns its entries (auto-delete is enabled on m_callbacks).
165 m_callbacks.setAutoDelete(true);
// Queues a startElementNs event. Strings are duplicated with xmlStrdup/xmlStrndup; the
// namespace array holds nb_namespaces * 2 entries and the attribute array
// nb_attributes * 5 entries.
168 void appendStartElementNSCallback(const xmlChar* xmlLocalName, const xmlChar* xmlPrefix, const xmlChar* xmlURI, int nb_namespaces, const xmlChar** namespaces, int nb_attributes, int nb_defaulted, const xmlChar** attributes)
170 PendingStartElementNSCallback* callback = new PendingStartElementNSCallback;
172 callback->xmlLocalName = xmlStrdup(xmlLocalName);
173 callback->xmlPrefix = xmlStrdup(xmlPrefix);
174 callback->xmlURI = xmlStrdup(xmlURI);
175 callback->nb_namespaces = nb_namespaces;
176 callback->namespaces = reinterpret_cast<xmlChar**>(xmlMalloc(sizeof(xmlChar*) * nb_namespaces * 2));
177 for (int i = 0; i < nb_namespaces * 2 ; i++)
178 callback->namespaces[i] = xmlStrdup(namespaces[i]);
179 callback->nb_attributes = nb_attributes;
180 callback->nb_defaulted = nb_defaulted;
181 callback->attributes = reinterpret_cast<xmlChar**>(xmlMalloc(sizeof(xmlChar*) * nb_attributes * 5));
182 for (int i = 0; i < nb_attributes; i++) {
183 // Each attribute has 5 elements in the array:
184 // name, prefix, uri, value and an end pointer.
186 for (int j = 0; j < 3; j++)
187 callback->attributes[i * 5 + j] = xmlStrdup(attributes[i * 5 + j]);
// The value is delimited by [value, end), so its length is the pointer difference.
189 int len = attributes[i * 5 + 4] - attributes[i * 5 + 3];
191 callback->attributes[i * 5 + 3] = xmlStrndup(attributes[i * 5 + 3], len);
// The copied end pointer points into the copied value, so it is not a separate allocation.
192 callback->attributes[i * 5 + 4] = callback->attributes[i * 5 + 3] + len;
195 m_callbacks.append(callback);
// Queues an endElementNs event (no payload).
198 void appendEndElementNSCallback()
200 PendingEndElementNSCallback* callback = new PendingEndElementNSCallback;
202 m_callbacks.append(callback);
// Queues a characters event with a copy of the first len bytes of s.
205 void appendCharactersCallback(const xmlChar* s, int len)
207 PendingCharactersCallback* callback = new PendingCharactersCallback;
209 callback->s = xmlStrndup(s, len);
212 m_callbacks.append(callback);
// Queues a processing-instruction event with copies of target and data.
215 void appendProcessingInstructionCallback(const xmlChar* target, const xmlChar* data)
217 PendingProcessingInstructionCallback* callback = new PendingProcessingInstructionCallback;
219 callback->target = xmlStrdup(target);
220 callback->data = xmlStrdup(data);
222 m_callbacks.append(callback);
// Queues a CDATA event with a copy of the first len bytes of s.
225 void appendCDATABlockCallback(const xmlChar* s, int len)
227 PendingCDATABlockCallback* callback = new PendingCDATABlockCallback;
229 callback->s = xmlStrndup(s, len);
232 m_callbacks.append(callback);
// Queues a comment event with a copy of s.
235 void appendCommentCallback(const xmlChar* s)
237 PendingCommentCallback* callback = new PendingCommentCallback;
239 callback->s = xmlStrdup(s);
241 m_callbacks.append(callback);
// Queues an internal-subset (doctype) event with copies of the three identifiers.
244 void appendInternalSubsetCallback(const xmlChar* name, const xmlChar* externalID, const xmlChar* systemID)
246 PendingInternalSubsetCallback* callback = new PendingInternalSubsetCallback;
248 callback->name = xmlStrdup(name);
249 callback->externalID = xmlStrdup(externalID);
250 callback->systemID = xmlStrdup(systemID);
252 m_callbacks.append(callback);
// Queues an error report; the message is copied with strdup (not xmlStrdup).
255 void appendErrorCallback(XMLTokenizer::ErrorType type, const char* message, int lineNumber, int columnNumber)
257 PendingErrorCallback* callback = new PendingErrorCallback;
259 callback->message = strdup(message);
260 callback->type = type;
261 callback->lineNumber = lineNumber;
262 callback->columnNumber = columnNumber;
264 m_callbacks.append(callback);
// Takes the oldest queued callback and removes it from the list.
267 void callAndRemoveFirstCallback(XMLTokenizer* tokenizer)
269 PendingCallback* cb = m_callbacks.getFirst();
272 m_callbacks.removeFirst();
275 bool isEmpty() const { return m_callbacks.isEmpty(); }
// Base of all queued events: call() replays the event on the given tokenizer.
278 struct PendingCallback {
280 virtual ~PendingCallback() { }
282 virtual void call(XMLTokenizer* tokenizer) = 0;
285 struct PendingStartElementNSCallback : public PendingCallback {
286 virtual ~PendingStartElementNSCallback() {
287 xmlFree(xmlLocalName);
290 for (int i = 0; i < nb_namespaces * 2; i++)
291 xmlFree(namespaces[i]);
// Only slots 0..3 of each attribute are freed; slot 4 aliases into slot 3 (see the append method).
293 for (int i = 0; i < nb_attributes; i++)
294 for (int j = 0; j < 4; j++)
295 xmlFree(attributes[i * 5 + j]);
299 virtual void call(XMLTokenizer* tokenizer) {
300 tokenizer->startElementNs(xmlLocalName, xmlPrefix, xmlURI,
301 nb_namespaces, (const xmlChar**)namespaces,
302 nb_attributes, nb_defaulted, (const xmlChar**)(attributes));
305 xmlChar* xmlLocalName;
309 xmlChar** namespaces;
312 xmlChar** attributes;
315 struct PendingEndElementNSCallback : public PendingCallback {
316 virtual void call(XMLTokenizer* tokenizer)
318 tokenizer->endElementNs();
322 struct PendingCharactersCallback : public PendingCallback {
323 virtual ~PendingCharactersCallback()
328 virtual void call(XMLTokenizer* tokenizer)
330 tokenizer->characters(s, len);
337 struct PendingProcessingInstructionCallback : public PendingCallback {
338 virtual ~PendingProcessingInstructionCallback()
344 virtual void call(XMLTokenizer* tokenizer)
346 tokenizer->processingInstruction(target, data);
353 struct PendingCDATABlockCallback : public PendingCallback {
354 virtual ~PendingCDATABlockCallback()
359 virtual void call(XMLTokenizer* tokenizer)
361 tokenizer->cdataBlock(s, len);
368 struct PendingCommentCallback : public PendingCallback {
369 virtual ~PendingCommentCallback()
374 virtual void call(XMLTokenizer* tokenizer)
376 tokenizer->comment(s);
382 struct PendingInternalSubsetCallback : public PendingCallback {
383 virtual ~PendingInternalSubsetCallback()
390 virtual void call(XMLTokenizer* tokenizer)
392 tokenizer->internalSubset(name, externalID, systemID);
// Replays through handleError (not error), since the message is already formatted.
400 struct PendingErrorCallback: public PendingCallback {
401 virtual ~PendingErrorCallback()
406 virtual void call(XMLTokenizer* tokenizer)
408 tokenizer->handleError(type, message, lineNumber, columnNumber);
411 XMLTokenizer::ErrorType type;
418 DeprecatedPtrList<PendingCallback> m_callbacks;
421 // --------------------------------
// Placeholder whose address openFunc hands back when it does not load anything;
// readFunc and closeFunc compare their context against it.
423 static int globalDescriptor = 0;
// libxml I/O "match" callback registered in createStringParser for both input and output.
425 static int matchFunc(const char* uri)
427 return 1; // Match everything.
// Loader used by openFunc for external loads; assigned through setLoaderForLibXMLCallbacks().
430 static DocLoader* globalDocLoader = 0;
// Read cursor over its own copy of a byte vector; openFunc returns one of these as the
// libxml I/O context for data it has loaded.
434 OffsetBuffer(const Vector<char>& b) : m_buffer(b), m_currentOffset(0) { }
// Copies min(askedToRead, bytes remaining) bytes from the current offset into outputBuffer
// and advances the offset by that amount.
436 int readOutBytes(char* outputBuffer, unsigned askedToRead) {
437 unsigned bytesLeft = m_buffer.size() - m_currentOffset;
438 unsigned lenToCopy = min(askedToRead, bytesLeft);
440 memcpy(outputBuffer, m_buffer.data() + m_currentOffset, lenToCopy);
441 m_currentOffset += lenToCopy;
447 Vector<char> m_buffer;
448 unsigned m_currentOffset;
// Classifies a URI that libxml wants to open: the test matches when inURI contains
// "/etc/xml/catalog" anywhere, or begins with either of the two w3.org prefixes below
// (strstr(...) == inURI is a starts-with check).
// NOTE(review): the return statements are not visible in this listing; openFunc treats a
// false result as "do not load" - confirm that a match here yields false.
451 static bool shouldAllowExternalLoad(const char* inURI)
453 if (strstr(inURI, "/etc/xml/catalog")
454 || strstr(inURI, "http://www.w3.org/Graphics/SVG") == inURI
455 || strstr(inURI, "http://www.w3.org/TR/xhtml") == inURI)
// libxml I/O "open" callback. Without a loader, or for a URI rejected by
// shouldAllowExternalLoad, it returns the address of globalDescriptor instead of loading.
// Otherwise it fetches the resource synchronously and returns a heap-allocated
// OffsetBuffer over the bytes; closeFunc receives that pointer later.
460 static void* openFunc(const char* uri)
462 if (!globalDocLoader || !shouldAllowExternalLoad(uri))
463 return &globalDescriptor;
465 ResourceResponse response;
466 Vector<char> data = ServeSynchronousRequest(cache()->loader(), globalDocLoader, KURL(uri), response);
468 return new OffsetBuffer(data);
// libxml I/O "read" callback: for a real OffsetBuffer context, copies up to len bytes
// into buffer and returns what readOutBytes reports.
471 static int readFunc(void* context, char* buffer, int len)
473 // Do 0-byte reads in case of a null descriptor
474 if (context == &globalDescriptor)
477 OffsetBuffer* data = static_cast<OffsetBuffer*>(context);
478 return data->readOutBytes(buffer, len);
// libxml I/O "write" callback registered for output in createStringParser.
481 static int writeFunc(void* context, const char* buffer, int len)
483 // Always just do 0-byte writes
// libxml I/O "close" callback: anything other than the shared placeholder is treated as
// an OffsetBuffer created by openFunc.
487 static int closeFunc(void* context)
489 if (context != &globalDescriptor) {
490 OffsetBuffer* data = static_cast<OffsetBuffer*>(context);
// printf-style error sink with the arguments deliberately ignored.
496 static void errorFunc(void*, const char*, ...)
498 // FIXME: It would be nice to display error messages somewhere.
// Sets the file-level loader that openFunc uses for subsequent external loads.
// Passing 0 makes openFunc refuse every load.
501 void setLoaderForLibXMLCallbacks(DocLoader* docLoader)
503 globalDocLoader = docLoader;
// Creates a libxml push-parser context wired to the given SAX handlers, with userData
// stored in _private (read back by getTokenizer). Entity replacement is enabled and the
// context is switched to UTF-16 in host byte order, matching the UChar buffers that
// XMLTokenizer::write feeds it.
506 static xmlParserCtxtPtr createStringParser(xmlSAXHandlerPtr handlers, void* userData)
// Guards the registration of the custom I/O callbacks below.
508 static bool didInit = false;
511 xmlRegisterInputCallbacks(matchFunc, openFunc, readFunc, closeFunc);
512 xmlRegisterOutputCallbacks(matchFunc, openFunc, writeFunc, closeFunc);
// NOTE(review): the result of xmlCreatePushParserCtxt is dereferenced without a null check.
516 xmlParserCtxtPtr parser = xmlCreatePushParserCtxt(handlers, 0, 0, 0, 0);
517 parser->_private = userData;
518 parser->replaceEntities = true;
// Endianness probe: the first byte of 0xFEFF in memory is 0xFF on a little-endian host.
519 const UChar BOM = 0xFEFF;
520 const unsigned char BOMHighByte = *reinterpret_cast<const unsigned char*>(&BOM);
521 xmlSwitchEncoding(parser, BOMHighByte == 0xFF ? XML_CHAR_ENCODING_UTF16LE : XML_CHAR_ENCODING_UTF16BE);
525 // --------------------------------
// Document constructor: parsing starts with the document itself as the current node, which
// is not separately referenced (setCurrentNode never references m_doc), and with
// fragment mode off. A fresh PendingCallbacks queue is allocated; the destructor deletes it.
527 XMLTokenizer::XMLTokenizer(Document* _doc, FrameView* _view)
531 , m_currentNode(_doc)
532 , m_currentNodeIsReferenced(false)
534 , m_sawXSLTransform(false)
535 , m_sawFirstElement(false)
536 , m_isXHTMLDocument(false)
537 , m_parserPaused(false)
538 , m_requestingScript(false)
539 , m_finishCalled(false)
542 , m_lastErrorColumn(0)
544 , m_scriptStartLine(0)
545 , m_parsingFragment(false)
546 , m_pendingCallbacks(new PendingCallbacks)
550 XMLTokenizer::XMLTokenizer(DocumentFragment* fragment, Element* parentElement)
551 : m_doc(fragment->document())
554 , m_currentNode(fragment)
555 , m_currentNodeIsReferenced(fragment)
557 , m_sawXSLTransform(false)
558 , m_sawFirstElement(false)
559 , m_isXHTMLDocument(false)
560 , m_parserPaused(false)
561 , m_requestingScript(false)
562 , m_finishCalled(false)
565 , m_lastErrorColumn(0)
567 , m_scriptStartLine(0)
568 , m_parsingFragment(true)
569 , m_pendingCallbacks(new PendingCallbacks)
576 // Add namespaces based on the parent node
577 Vector<Element*> elemStack;
578 while (parentElement) {
579 elemStack.append(parentElement);
581 Node* n = parentElement->parentNode();
582 if (!n || !n->isElementNode())
584 parentElement = static_cast<Element*>(n);
587 if (elemStack.isEmpty())
590 for (Element* element = elemStack.last(); !elemStack.isEmpty(); elemStack.removeLast()) {
591 if (NamedAttrMap* attrs = element->attributes()) {
592 for (unsigned i = 0; i < attrs->length(); i++) {
593 Attribute* attr = attrs->attributeItem(i);
594 if (attr->localName() == "xmlns")
595 m_defaultNamespaceURI = attr->value();
596 else if (attr->prefix() == "xmlns")
597 m_prefixToNamespaceMap.set(attr->localName().impl(), attr->value().impl());
// Releases what the tokenizer owns: the pending-callback queue and this client's
// registration on a still-pending script.
603 XMLTokenizer::~XMLTokenizer()
// Only fragment tokenizers do extra work on m_doc here.
606 if (m_parsingFragment && m_doc)
608 delete m_pendingCallbacks;
610 m_pendingScript->deref(this);
// Makes n the node that new children are appended to. Any node other than null or the
// document itself is flagged as needing a reference; the previously current node is
// dereferenced if it had been flagged.
613 void XMLTokenizer::setCurrentNode(Node* n)
615 bool nodeNeedsReference = n && n != m_doc;
616 if (nodeNeedsReference)
618 if (m_currentNodeIsReferenced)
619 m_currentNode->deref();
621 m_currentNodeIsReferenced = nodeNeedsReference;
// Feeds one chunk of decoded source text to libxml. The second parameter is unused.
624 bool XMLTokenizer::write(const SegmentedString& s, bool /*appendData*/)
626 String parseString = s.toString();
// Keep the raw text while an XSL transform has been seen or is still possible
// (no element seen yet); end() hands it to xmlDocPtrForString.
628 if (m_sawXSLTransform || !m_sawFirstElement)
629 m_originalSourceForTransform += parseString;
631 if (m_parserStopped || m_sawXSLTransform)
// While paused, chunks are queued in m_pendingSrc instead of being parsed.
634 if (m_parserPaused) {
635 m_pendingSrc.append(s);
640 initializeParserContext();
642 // libXML throws an error if you try to switch the encoding for an empty string.
643 if (parseString.length()) {
644 // Hack around libxml2's lack of encoding override support by manually
645 // resetting the encoding to UTF-16 before every chunk. Otherwise libxml
646 // will detect <?xml version="1.0" encoding="<encoding name>"?> blocks
647 // and switch encodings, causing the parse to fail.
// Same host-endianness probe as in createStringParser.
648 const UChar BOM = 0xFEFF;
649 const unsigned char BOMHighByte = *reinterpret_cast<const unsigned char*>(&BOM);
650 xmlSwitchEncoding(m_context, BOMHighByte == 0xFF ? XML_CHAR_ENCODING_UTF16LE : XML_CHAR_ENCODING_UTF16BE);
// The size argument is in bytes, hence sizeof(UChar) * length; the final 0 means "not the last chunk".
652 xmlParseChunk(m_context, reinterpret_cast<const char*>(parseString.characters()), sizeof(UChar) * parseString.length(), 0);
// Decodes len bytes of UTF-8 from a libxml string into a String.
658 inline String toString(const xmlChar* str, unsigned len)
660 return UTF8Encoding().decode(reinterpret_cast<const char*>(str), len);
// Decodes a NUL-terminated UTF-8 libxml string into a String; a null pointer is treated
// as the empty string.
663 inline String toString(const xmlChar* str)
665 const char* cstr = str ? reinterpret_cast<const char*>(str) : "";
666 return UTF8Encoding().decode(cstr, strlen(cstr));
// Record overlay for the flat namespace array libxml passes to startElementNs;
// handleElementNamespaces reads the prefix and uri of each entry through it.
669 struct _xmlSAX2Namespace {
670 const xmlChar* prefix;
673 typedef struct _xmlSAX2Namespace xmlSAX2Namespace;
// Mirrors each namespace declaration reported by libxml onto newElement as an attribute in
// the http://www.w3.org/2000/xmlns/ namespace: "xmlns" for an entry without a prefix,
// "xmlns:<prefix>" otherwise. ec carries any exception raised by setAttributeNS.
675 static inline void handleElementNamespaces(Element* newElement, const xmlChar** libxmlNamespaces, int nb_namespaces, ExceptionCode& ec)
677 xmlSAX2Namespace* namespaces = reinterpret_cast<xmlSAX2Namespace*>(libxmlNamespaces);
678 for(int i = 0; i < nb_namespaces; i++) {
679 String namespaceQName = "xmlns";
680 String namespaceURI = toString(namespaces[i].uri);
681 if (namespaces[i].prefix)
682 namespaceQName = "xmlns:" + toString(namespaces[i].prefix);
683 newElement->setAttributeNS("http://www.w3.org/2000/xmlns/", namespaceQName, namespaceURI, ec);
684 if (ec) // exception setting attributes
// Record overlay for the flat attribute array libxml passes to startElementNs.
// handleElementAttributes also reads uri and end members through it, and
// PendingCallbacks copies five pointers per attribute.
689 struct _xmlSAX2Attributes {
690 const xmlChar* localname;
691 const xmlChar* prefix;
693 const xmlChar* value;
696 typedef struct _xmlSAX2Attributes xmlSAX2Attributes;
// Copies each attribute reported by libxml onto newElement via setAttributeNS. ec carries
// any exception raised while setting an attribute.
698 static inline void handleElementAttributes(Element* newElement, const xmlChar** libxmlAttributes, int nb_attributes, ExceptionCode& ec)
700 xmlSAX2Attributes* attributes = reinterpret_cast<xmlSAX2Attributes*>(libxmlAttributes);
701 for(int i = 0; i < nb_attributes; i++) {
702 String attrLocalName = toString(attributes[i].localname);
// The value is not NUL-terminated; its extent is [value, end).
703 int valueLength = (int) (attributes[i].end - attributes[i].value);
704 String attrValue = toString(attributes[i].value, valueLength);
705 String attrPrefix = toString(attributes[i].prefix);
// An attribute without a prefix gets a null namespace URI, whatever libxml reported.
706 String attrURI = attrPrefix.isEmpty() ? String() : toString(attributes[i].uri);
707 String attrQName = attrPrefix.isEmpty() ? attrLocalName : attrPrefix + ":" + attrLocalName;
709 newElement->setAttributeNS(attrURI, attrQName, attrValue, ec);
710 if (ec) // exception setting attributes
// SAX start-tag callback: creates the element, applies its namespace declarations and
// attributes, appends it under the current node and makes it the new current node.
715 void XMLTokenizer::startElementNs(const xmlChar* xmlLocalName, const xmlChar* xmlPrefix, const xmlChar* xmlURI, int nb_namespaces, const xmlChar** libxmlNamespaces, int nb_attributes, int nb_defaulted, const xmlChar** libxmlAttributes)
// While paused, the event is deep-copied into the pending queue for later replay.
720 if (m_parserPaused) {
721 m_pendingCallbacks->appendStartElementNSCallback(xmlLocalName, xmlPrefix, xmlURI, nb_namespaces, libxmlNamespaces, nb_attributes, nb_defaulted, libxmlAttributes);
725 m_sawFirstElement = true;
729 String localName = toString(xmlLocalName);
730 String uri = toString(xmlURI);
731 String prefix = toString(xmlPrefix);
732 String qName = prefix.isEmpty() ? localName : prefix + ":" + localName;
// In fragment mode an unresolved namespace is looked up in the declarations collected
// from the context element's ancestors by the fragment constructor.
734 if (m_parsingFragment && uri.isEmpty()) {
735 if (!prefix.isEmpty())
736 uri = String(m_prefixToNamespaceMap.get(prefix.impl()));
738 uri = m_defaultNamespaceURI;
741 ExceptionCode ec = 0;
742 RefPtr<Element> newElement = m_doc->createElementNS(uri, qName, ec);
748 handleElementNamespaces(newElement.get(), libxmlNamespaces, nb_namespaces, ec);
754 handleElementAttributes(newElement.get(), libxmlAttributes, nb_attributes, ec);
760 // FIXME: This hack ensures implicit table bodies get constructed in XHTML and XML files.
761 // We want to consolidate this with the HTML parser and HTML DOM code at some point.
762 // For now, it's too risky to rip that code up.
763 if (m_currentNode->hasTagName(tableTag) && newElement->hasTagName(trTag)) {
764 RefPtr<Node> implicitTBody = new HTMLTableSectionElement(tbodyTag, m_doc, true /* implicit */);
765 m_currentNode->addChild(implicitTBody.get());
766 setCurrentNode(implicitTBody.get());
767 if (m_view && !implicitTBody->attached())
768 implicitTBody->attach();
771 if (newElement->hasTagName(scriptTag))
772 static_cast<HTMLScriptElement*>(newElement.get())->setCreatedByParser(true);
// Remember where a script element starts; endElementNs passes this line to executeScript.
774 if (newElement->hasTagName(HTMLNames::scriptTag)
776 || newElement->hasTagName(SVGNames::scriptTag)
779 m_scriptStartLine = lineNumber();
781 if (!m_currentNode->addChild(newElement.get())) {
786 setCurrentNode(newElement.get());
// Elements are attached only when a view is present.
787 if (m_view && !newElement->attached())
788 newElement->attach();
// SAX end-tag callback: closes the current element, runs or requests a script if the
// element is a script element and a view is present, then makes the parent current.
791 void XMLTokenizer::endElementNs()
796 if (m_parserPaused) {
797 m_pendingCallbacks->appendEndElementNSCallback();
803 Node* n = m_currentNode;
// Implicit nodes (such as the tbody inserted by startElementNs) are stepped over.
804 while (n->implicitNode())
// Holds the parent alive across script execution below.
806 RefPtr<Node> parent = n->parentNode();
809 // don't load external scripts for standalone documents (for now)
810 if (n->isElementNode() && m_view && (static_cast<Element*>(n)->hasTagName(scriptTag)
812 || static_cast<Element*>(n)->hasTagName(SVGNames::scriptTag)
817 ASSERT(!m_pendingScript);
819 m_requestingScript = true;
821 Element* scriptElement = static_cast<Element*>(n);
// The script URL comes from src for HTML script elements and xlink:href for SVG ones.
824 if (static_cast<Element*>(n)->hasTagName(scriptTag))
825 scriptHref = scriptElement->getAttribute(srcAttr);
827 else if (static_cast<Element*>(n)->hasTagName(SVGNames::scriptTag))
828 scriptHref = scriptElement->getAttribute(XLinkNames::hrefAttr);
831 if (!scriptHref.isEmpty()) {
832 // we have a src attribute
833 const AtomicString& charset = scriptElement->getAttribute(charsetAttr);
834 if ((m_pendingScript = m_doc->docLoader()->requestScript(scriptHref, charset))) {
835 m_scriptElement = scriptElement;
836 m_pendingScript->ref(this);
838 // m_pendingScript will be 0 if script was already loaded and ref() executed it
// Inline script: the source is the concatenation of the element's text and CDATA children.
845 String scriptCode = "";
846 for (Node* child = scriptElement->firstChild(); child; child = child->nextSibling()) {
847 if (child->isTextNode() || child->nodeType() == Node::CDATA_SECTION_NODE)
848 scriptCode += static_cast<CharacterData*>(child)->data();
850 m_view->frame()->loader()->executeScript(m_doc->URL(), m_scriptStartLine - 1, 0, scriptCode);
853 m_requestingScript = false;
856 setCurrentNode(parent.get());
// SAX character-data callback: appends the decoded text to the current text node,
// creating one through enterText() when the current node is not already text.
859 void XMLTokenizer::characters(const xmlChar* s, int len)
864 if (m_parserPaused) {
865 m_pendingCallbacks->appendCharactersCallback(s, len);
869 if (m_currentNode->isTextNode() || enterText()) {
870 ExceptionCode ec = 0;
871 static_cast<Text*>(m_currentNode)->appendData(toString(s, len), ec);
// Creates an empty Text node under the current node and makes it current. characters()
// uses the boolean result to decide whether m_currentNode may be treated as a Text node.
875 bool XMLTokenizer::enterText()
877 RefPtr<Node> newNode = new Text(m_doc, "");
878 if (!m_currentNode->addChild(newNode.get()))
880 setCurrentNode(newNode.get());
// Leaves a text node opened by enterText(): attaches it when a view is present and it is
// not yet attached, then steps back to its parent if it has one.
884 void XMLTokenizer::exitText()
889 if (!m_currentNode || !m_currentNode->isTextNode())
892 if (m_view && m_currentNode && !m_currentNode->attached())
893 m_currentNode->attach();
895 // FIXME: What's the right thing to do if the parent is really 0?
896 // Just leaving the current node set to the text node doesn't make much sense.
897 if (Node* par = m_currentNode->parentNode())
// Records an already-formatted parser message in m_errorMessages, prefixed with
// "warning" or "error" and the position, and remembers that position.
// Fatal errors are always recorded; other messages only while m_errorCount is below
// maxErrors and the position differs from the last recorded one.
901 void XMLTokenizer::handleError(ErrorType type, const char* m, int lineNumber, int columnNumber)
// NOTE(review): the two != tests are joined with &&, so a non-fatal message is also
// dropped when only the line or only the column equals the previous one. If the intent
// is to skip exact repeats of the same position, this should be ||. Confirm.
903 if (type == fatal || (m_errorCount < maxErrors && m_lastErrorLine != lineNumber && m_lastErrorColumn != columnNumber)) {
906 m_errorMessages += String::format("warning on line %d at column %d: %s", lineNumber, columnNumber, m);
910 m_errorMessages += String::format("error on line %d at column %d: %s", lineNumber, columnNumber, m);
913 m_lastErrorLine = lineNumber;
914 m_lastErrorColumn = columnNumber;
925 void XMLTokenizer::error(ErrorType type, const char* message, va_list args)
932 vsnprintf(m, sizeof(m) - 1, message, args);
935 vasprintf(&m, message, args);
939 m_pendingCallbacks->appendErrorCallback(type, m, lineNumber(), columnNumber());
941 handleError(type, m, lineNumber(), columnNumber());
943 #if !PLATFORM(WIN_OS)
// SAX processing-instruction callback: creates the PI node, appends it under the current
// node, and checks whether it is a stylesheet PI that introduces an XSL transform.
948 void XMLTokenizer::processingInstruction(const xmlChar* target, const xmlChar* data)
953 if (m_parserPaused) {
954 m_pendingCallbacks->appendProcessingInstructionCallback(target, data);
960 // ### handle exceptions
962 RefPtr<ProcessingInstruction> pi = m_doc->createProcessingInstruction(
963 toString(target), toString(data), exception);
967 if (!m_currentNode->addChild(pi.get()))
969 if (m_view && !pi->attached())
972 // don't load stylesheets for standalone documents
973 if (m_doc->frame()) {
// A transform is only recognised before the first element, and only when
// checkStyleSheet() returns false for this PI.
974 m_sawXSLTransform = !m_sawFirstElement && !pi->checkStyleSheet();
976 // Pretend we didn't see this PI if we're the result of a transform.
977 if (m_sawXSLTransform && !m_doc->transformSourceDocument())
979 if (m_sawXSLTransform)
981 // Stop the SAX parser.
// SAX CDATA callback: appends a CDATASection holding the decoded text under the current node.
986 void XMLTokenizer::cdataBlock(const xmlChar* s, int len)
991 if (m_parserPaused) {
992 m_pendingCallbacks->appendCDATABlockCallback(s, len);
998 RefPtr<Node> newNode = new CDATASection(m_doc, toString(s, len));
999 if (!m_currentNode->addChild(newNode.get()))
1001 if (m_view && !newNode->attached())
// SAX comment callback: appends a Comment node under the current node. Unlike
// cdataBlock(), the result of addChild is not checked here.
1005 void XMLTokenizer::comment(const xmlChar* s)
1007 if (m_parserStopped)
1010 if (m_parserPaused) {
1011 m_pendingCallbacks->appendCommentCallback(s);
1017 RefPtr<Node> newNode = new Comment(m_doc, toString(s));
1018 m_currentNode->addChild(newNode.get());
1019 if (m_view && !newNode->attached())
// SAX start-of-document callback: copies the XML declaration's version, standalone flag
// and encoding onto the document. Exceptions reported through ec are not inspected here.
1023 void XMLTokenizer::startDocument(const xmlChar* version, const xmlChar* encoding, int standalone)
1025 ExceptionCode ec = 0;
1028 m_doc->setXMLVersion(toString(version), ec);
1029 m_doc->setXMLStandalone(standalone == 1, ec); // possible values are 0, 1, and -1
1031 m_doc->setXMLEncoding(toString(encoding));
// SAX doctype callback: installs a DocumentType built from the name and the two
// identifiers on the document.
1034 void XMLTokenizer::internalSubset(const xmlChar* name, const xmlChar* externalID, const xmlChar* systemID)
1036 if (m_parserStopped)
1039 if (m_parserPaused) {
1040 m_pendingCallbacks->appendInternalSubsetCallback(name, externalID, systemID);
1044 Document* doc = m_doc;
1048 doc->setDocType(new DocumentType(doc, toString(name), toString(externalID), toString(systemID)));
// Recovers the tokenizer from a SAX closure: the closure is the libxml parser context and
// createStringParser stored the tokenizer in its _private field.
1051 inline XMLTokenizer* getTokenizer(void* closure)
1053 xmlParserCtxtPtr ctxt = static_cast<xmlParserCtxtPtr>(closure);
1054 return static_cast<XMLTokenizer*>(ctxt->_private);
1057 // This is a hack around http://bugzilla.gnome.org/show_bug.cgi?id=159219
1058 // Otherwise libxml seems to call all the SAX callbacks twice for any replaced entity.
// Returns true when the parser context currently has a non-null node; the SAX
// trampolines below test this before forwarding an event.
1059 static inline bool hackAroundLibXMLEntityBug(void* closure)
1061 return static_cast<xmlParserCtxtPtr>(closure)->node;
// SAX trampoline: forwards a start-tag event to the tokenizer stored in the context.
1064 static void startElementNsHandler(void* closure, const xmlChar* localname, const xmlChar* prefix, const xmlChar* uri, int nb_namespaces, const xmlChar** namespaces, int nb_attributes, int nb_defaulted, const xmlChar** libxmlAttributes)
1066 if (hackAroundLibXMLEntityBug(closure))
1069 getTokenizer(closure)->startElementNs(localname, prefix, uri, nb_namespaces, namespaces, nb_attributes, nb_defaulted, libxmlAttributes);
// SAX trampoline: forwards an end-tag event; the name arguments are not passed on.
1072 static void endElementNsHandler(void* closure, const xmlChar* localname, const xmlChar* prefix, const xmlChar* uri)
1074 if (hackAroundLibXMLEntityBug(closure))
1077 getTokenizer(closure)->endElementNs();
// SAX trampoline: forwards character data to the tokenizer.
1080 static void charactersHandler(void* closure, const xmlChar* s, int len)
1082 if (hackAroundLibXMLEntityBug(closure))
1085 getTokenizer(closure)->characters(s, len);
// SAX trampoline: forwards a processing instruction to the tokenizer.
1088 static void processingInstructionHandler(void* closure, const xmlChar* target, const xmlChar* data)
1090 if (hackAroundLibXMLEntityBug(closure))
1093 getTokenizer(closure)->processingInstruction(target, data);
// SAX trampoline: forwards a CDATA block to the tokenizer.
1096 static void cdataBlockHandler(void* closure, const xmlChar* s, int len)
1098 if (hackAroundLibXMLEntityBug(closure))
1101 getTokenizer(closure)->cdataBlock(s, len);
// SAX trampoline: forwards a comment to the tokenizer.
1104 static void commentHandler(void* closure, const xmlChar* comment)
1106 if (hackAroundLibXMLEntityBug(closure))
1109 getTokenizer(closure)->comment(comment);
// SAX trampoline: packages the variadic arguments and reports them as a warning.
1112 static void warningHandler(void* closure, const char* message, ...)
1115 va_start(args, message);
1116 getTokenizer(closure)->error(XMLTokenizer::warning, message, args);
// SAX trampoline: packages the variadic arguments and reports them as a fatal error.
1120 static void fatalErrorHandler(void* closure, const char* message, ...)
1123 va_start(args, message);
1124 getTokenizer(closure)->error(XMLTokenizer::fatal, message, args);
// SAX trampoline: packages the variadic arguments and reports them as a non-fatal error.
1128 static void normalErrorHandler(void* closure, const char* message, ...)
1131 va_start(args, message);
1132 getTokenizer(closure)->error(XMLTokenizer::nonFatal, message, args);
1136 // Using a global variable entity and marking it XML_INTERNAL_PREDEFINED_ENTITY is
1137 // a hack to avoid malloc/free. Using a global variable like this could cause trouble
1138 // if libxml implementation details were to change
// Five-byte scratch buffer that getXHTMLEntity overwrites on every call; the entity
// below points at it twice.
1139 static xmlChar sharedXHTMLEntityResult[5] = {0,0,0,0,0};
1140 static xmlEntity sharedXHTMLEntity = {
1141 0, XML_ENTITY_DECL, 0, 0, 0, 0, 0, 0, 0,
1142 sharedXHTMLEntityResult, sharedXHTMLEntityResult, 0,
1143 XML_INTERNAL_PREDEFINED_ENTITY, 0, 0, 0, 0, 0
// Resolves an XHTML named entity to a single character and returns the shared static
// entity filled with its UTF-8 form. The returned entity (name, length and content) is
// overwritten by the next call.
1146 static xmlEntityPtr getXHTMLEntity(const xmlChar* name)
1148 UChar c = decodeNamedEntity(reinterpret_cast<const char*>(name));
1152 CString value = String(&c, 1).utf8();
// The scratch buffer is 5 bytes, so the UTF-8 form plus terminator must fit.
1153 assert(value.length() < 5);
1154 sharedXHTMLEntity.length = value.length();
1155 sharedXHTMLEntity.name = name;
// length + 1 also copies the terminating NUL.
1156 memcpy(sharedXHTMLEntityResult, value.data(), sharedXHTMLEntity.length + 1);
1158 return &sharedXHTMLEntity;
// SAX entity lookup: tries libxml's predefined entities, then entities declared in the
// document, and finally, only for documents flagged as XHTML, the XHTML named-entity table.
1161 static xmlEntityPtr getEntityHandler(void* closure, const xmlChar* name)
1163 xmlParserCtxtPtr ctxt = static_cast<xmlParserCtxtPtr>(closure);
1164 xmlEntityPtr ent = xmlGetPredefinedEntity(name);
1168 ent = xmlGetDocEntity(ctxt->myDoc, name);
1169 if (!ent && getTokenizer(closure)->isXHTMLDocument())
1170 ent = getXHTMLEntity(name);
// SAX trampoline: reports the XML declaration values held by the parser context to the
// tokenizer, then runs libxml's default start-document handling.
1175 static void startDocumentHandler(void* closure)
1177 xmlParserCtxt* ctxt = static_cast<xmlParserCtxt*>(closure);
1178 getTokenizer(closure)->startDocument(ctxt->version, ctxt->encoding, ctxt->standalone);
1179 xmlSAX2StartDocument(closure);
// SAX trampoline: reports the doctype to the tokenizer, then runs libxml's default
// internal-subset handling.
1182 static void internalSubsetHandler(void* closure, const xmlChar* name, const xmlChar* externalID, const xmlChar* systemID)
1184 getTokenizer(closure)->internalSubset(name, externalID, systemID);
1185 xmlSAX2InternalSubset(closure, name, externalID, systemID);
// SAX external-subset callback: flags the tokenizer as parsing XHTML when the public
// identifier is one of the listed XHTML DTDs (exact string match). The name and system
// identifier are not consulted.
1188 static void externalSubsetHandler(void* closure, const xmlChar* name, const xmlChar* externalId, const xmlChar* systemId)
1190 String extId = toString(externalId);
1191 if ((extId == "-//W3C//DTD XHTML 1.0 Transitional//EN")
1192 || (extId == "-//W3C//DTD XHTML 1.1//EN")
1193 || (extId == "-//W3C//DTD XHTML 1.0 Strict//EN")
1194 || (extId == "-//W3C//DTD XHTML 1.0 Frameset//EN")
1195 || (extId == "-//W3C//DTD XHTML Basic 1.0//EN")
1196 || (extId == "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN")
1197 || (extId == "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN")
1198 || (extId == "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"))
1199 getTokenizer(closure)->setIsXHTMLDocument(true); // controls if we replace entities or not.
// Intentionally empty SAX callback; it is installed so that the handler slot is non-null.
1202 static void ignorableWhitespaceHandler(void* ctx, const xmlChar* ch, int len)
1204 // nothing to do, but we need this to work around a crasher
1205 // http://bugzilla.gnome.org/show_bug.cgi?id=172255
1206 // http://bugs.webkit.org/show_bug.cgi?id=5792
// Builds the SAX handler table, resets the per-parse flags and creates the libxml push
// parser context with this tokenizer as its user data.
1209 void XMLTokenizer::initializeParserContext()
// Start from an all-null table so that only the handlers assigned below are active.
1212 memset(&sax, 0, sizeof(sax));
1213 sax.error = normalErrorHandler;
1214 sax.fatalError = fatalErrorHandler;
1215 sax.characters = charactersHandler;
1216 sax.processingInstruction = processingInstructionHandler;
1217 sax.cdataBlock = cdataBlockHandler;
1218 sax.comment = commentHandler;
1219 sax.warning = warningHandler;
1220 sax.startElementNs = startElementNsHandler;
1221 sax.endElementNs = endElementNsHandler;
1222 sax.getEntity = getEntityHandler;
1223 sax.startDocument = startDocumentHandler;
1224 sax.internalSubset = internalSubsetHandler;
1225 sax.externalSubset = externalSubsetHandler;
1226 sax.ignorableWhitespace = ignorableWhitespaceHandler;
// Entity declarations use libxml's stock handler, so getEntityHandler can find them
// through xmlGetDocEntity.
1227 sax.entityDecl = xmlSAX2EntityDecl;
// XML_SAX2_MAGIC selects the SAX2 interface (the *Ns element callbacks).
1228 sax.initialized = XML_SAX2_MAGIC;
1230 m_parserStopped = false;
1232 m_sawXSLTransform = false;
1233 m_sawFirstElement = false;
1234 m_context = createStringParser(&sax, this);
// Finishes the parse: hands the raw source to the document for XSL transformation if a
// transform PI was seen, tells libxml the input is complete, frees the parser context,
// and notifies the document that parsing is finished.
1237 void XMLTokenizer::end()
1240 if (m_sawXSLTransform) {
1241 m_doc->setTransformSource(xmlDocPtrForString(m_doc->docLoader(), m_originalSourceForTransform, m_doc->URL()));
1243 m_doc->setParsing(false); // Make the doc think it's done, so it will apply xsl sheets.
1244 m_doc->updateStyleSelector();
1245 m_doc->setParsing(true);
1246 m_parserStopped = true;
1251 // Tell libxml we're done.
// A null chunk with the terminate flag set ends the push parse.
1252 xmlParseChunk(m_context, 0, 0, 1);
// The libxml document tree, if one was built, is freed separately from the context.
1254 if (m_context->myDoc)
1255 xmlFreeDoc(m_context->myDoc);
1256 xmlFreeParserCtxt(m_context);
1261 insertErrorMessageBlock();
1264 m_doc->updateStyleSelector();
1268 m_doc->finishedParsing();
// Tokenizer interface: records that the caller has delivered all input.
1271 void XMLTokenizer::finish()
1274 m_finishCalled = true;
// Builds the XHTML <parsererror> report: a styled box containing a heading, the collected
// error text in a monospace <div>, and a closing heading. The element is returned
// detached. Exception codes written to ec are not inspected.
1279 static inline RefPtr<Element> createXHTMLParserErrorHeader(Document* doc, const String& errorMessages)
1281 ExceptionCode ec = 0;
1282 RefPtr<Element> reportElement = doc->createElementNS(xhtmlNamespaceURI, "parsererror", ec);
1283 reportElement->setAttribute(styleAttr, "white-space: pre; border: 2px solid #c77; padding: 0 1em 0 1em; margin: 1em; background-color: #fdd; color: black");
1285 RefPtr<Element> h3 = doc->createElementNS(xhtmlNamespaceURI, "h3", ec);
1286 reportElement->appendChild(h3.get(), ec);
1287 h3->appendChild(doc->createTextNode("This page contains the following errors:"), ec);
1289 RefPtr<Element> fixed = doc->createElementNS(xhtmlNamespaceURI, "div", ec);
1290 reportElement->appendChild(fixed.get(), ec);
1291 fixed->setAttribute(styleAttr, "font-family:monospace;font-size:12px");
1292 fixed->appendChild(doc->createTextNode(errorMessages), ec);
// h3 is reused for the second heading; the first one stays alive as a child of reportElement.
1294 h3 = doc->createElementNS(xhtmlNamespaceURI, "h3", ec);
1295 reportElement->appendChild(h3.get(), ec);
1296 h3->appendChild(doc->createTextNode("Below is a rendering of the page up to the first error."), ec);
1298 return reportElement;
// Inserts the parser-error report (built from m_errorMessages) at the top of
// the document, creating or wrapping a root element first when needed, and
// then updates rendering.
// NOTE(review): brace-only and short lines (possibly preprocessor guards
// around the SVG branch) are missing from this excerpt.
1301 void XMLTokenizer::insertErrorMessageBlock()
1303 // One or more errors occurred during parsing of the code. Display an error block to the user above
1304 // the normal content (the DOM tree is created manually and includes line/col info regarding
1305 // where the errors are located)
1307 // Create elements for display
1308 ExceptionCode ec = 0;
1309 Document* doc = m_doc;
1310 Node* documentElement = doc->documentElement();
// Case 1: the document has no root element. Create <html><body> and use the
// new body as the container for the report.
1311 if (!documentElement) {
1312 RefPtr<Node> rootElement = doc->createElementNS(xhtmlNamespaceURI, "html", ec);
1313 doc->appendChild(rootElement, ec);
1314 RefPtr<Node> body = doc->createElementNS(xhtmlNamespaceURI, "body", ec);
1315 rootElement->appendChild(body, ec);
1316 documentElement = body.get();
// Case 2: the root is in the SVG namespace. Move it under a new
// <html><body> and use that body as the container.
1319 else if (documentElement->namespaceURI() == SVGNames::svgNamespaceURI) {
1320 // Until our SVG implementation has text support, it is best if we
1321 // wrap the erroneous SVG document in an xhtml document and render
1322 // the combined document with error messages.
1323 RefPtr<Node> rootElement = doc->createElementNS(xhtmlNamespaceURI, "html", ec);
1324 RefPtr<Node> body = doc->createElementNS(xhtmlNamespaceURI, "body", ec);
1325 rootElement->appendChild(body, ec);
1326 body->appendChild(documentElement, ec);
1327 doc->appendChild(rootElement.get(), ec);
1328 documentElement = body.get();
// Put the report before the container's current first child.
1332 RefPtr<Element> reportElement = createXHTMLParserErrorHeader(doc, m_errorMessages);
1333 documentElement->insertBefore(reportElement, documentElement->firstChild(), ec);
// When the document has a transform source document, add a paragraph saying
// that the reported positions refer to the transformed result.
1335 if (doc->transformSourceDocument()) {
1336 RefPtr<Element> par = doc->createElementNS(xhtmlNamespaceURI, "p", ec);
1337 reportElement->appendChild(par, ec);
1338 par->setAttribute(styleAttr, "white-space: normal");
1339 par->appendChild(doc->createTextNode("This document was created as the result of an XSL transformation. The line and column numbers given are from the transformed result."), ec);
1342 doc->updateRendering();
// CachedResourceClient callback for the pending external script.
//   finishedObj - the resource that finished; asserted to be m_pendingScript.
// NOTE(review): short lines are missing from this excerpt, including the
// branch that chooses between the error path and the execute path, and the
// body of the final `if`.
1345 void XMLTokenizer::notifyFinished(CachedResource* finishedObj)
1347 ASSERT(m_pendingScript == finishedObj);
1348 ASSERT(m_pendingScript->accessCount() > 0);
// Copy what is needed from the cached script before dropping our reference to it.
1350 String cachedScriptUrl = m_pendingScript->url();
1351 String scriptSource = m_pendingScript->script();
1352 bool errorOccurred = m_pendingScript->errorOccurred();
1353 m_pendingScript->deref(this);
1354 m_pendingScript = 0;
// Hold the script element in a local RefPtr and clear the member.
1356 RefPtr<Element> e = m_scriptElement;
1357 m_scriptElement = 0;
// Error path: dispatch an error event on the script element.
// NOTE(review): presumably taken when errorOccurred is true -- the condition line is not visible.
1360 EventTargetNodeCast(e.get())->dispatchHTMLEvent(errorEvent, true, false);
// Execute path: run the script source, then dispatch a load event on the element.
1362 m_view->frame()->loader()->executeScript(cachedScriptUrl, 0, 0, scriptSource);
1363 EventTargetNodeCast(e.get())->dispatchHTMLEvent(loadEvent, false, false);
1366 m_scriptElement = 0;
// NOTE(review): the statement controlled by this condition is missing from the excerpt.
1368 if (!m_requestingScript)
// Returns true while m_pendingScript is non-null.
1372 bool XMLTokenizer::isWaitingForScripts() const
1374 return m_pendingScript != 0;
// Parses `source` (UTF-16 code units) in one call into a libxml document.
//   docLoader - installed via setLoaderForLibXMLCallbacks() for the duration of the parse.
//   source    - text to parse; an empty string is special-cased first.
//   url       - NOTE(review): not used on any visible line; presumably passed as
//               xmlReadMemory's URL argument on the missing line between the
//               size and encoding arguments -- confirm.
// NOTE(review): the return statements are on lines missing from this excerpt.
1378 void* xmlDocPtrForString(DocLoader* docLoader, const String& source, const DeprecatedString& url)
1380 if (source.isEmpty())
1383 // Parse in a single chunk into an xmlDocPtr
1384 // FIXME: Hook up error handlers so that a failure to parse the main document results in
1385 // good error messages.
// Host byte-order probe: read the first byte in memory of the code unit
// 0xFEFF. Despite the variable name, this is whichever byte is stored first;
// 0xFF there means the low byte comes first, i.e. little-endian.
1386 const UChar BOM = 0xFEFF;
1387 const unsigned char BOMHighByte = *reinterpret_cast<const unsigned char*>(&BOM);
// Save libxml's current generic error handler and context for restoring below.
1389 xmlGenericErrorFunc oldErrorFunc = xmlGenericError;
1390 void* oldErrorContext = xmlGenericErrorContext;
// Install the loader and errorFunc for the duration of the parse.
1392 setLoaderForLibXMLCallbacks(docLoader);
1393 xmlSetGenericErrorFunc(0, errorFunc);
// The buffer is the string's raw UChar storage, sized in bytes, with the
// encoding name chosen from the byte-order probe above.
1395 xmlDocPtr sourceDoc = xmlReadMemory(reinterpret_cast<const char*>(source.characters()),
1396 source.length() * sizeof(UChar),
1398 BOMHighByte == 0xFF ? "UTF-16LE" : "UTF-16BE",
1399 XSLT_PARSE_OPTIONS);
// Undo the temporary loader and error-handler installation.
1401 setLoaderForLibXMLCallbacks(0);
1402 xmlSetGenericErrorFunc(oldErrorContext, oldErrorFunc);
// Factory function: allocates a new XMLTokenizer for document `d` and view `v`
// with `new` and returns it as a Tokenizer*.
1408 Tokenizer* newXMLTokenizer(Document* d, FrameView* v)
1410 return new XMLTokenizer(d, v);
// Returns the line recorded on the libxml parser context's current input (m_context->input->line).
1413 int XMLTokenizer::lineNumber() const
1415 return m_context->input->line;
// Returns the column recorded on the libxml parser context's current input (m_context->input->col).
1418 int XMLTokenizer::columnNumber() const
1420 return m_context->input->col;
// Stops parsing: runs the base-class Tokenizer::stopParsing() first, then
// asks libxml to stop the parser for m_context.
1423 void XMLTokenizer::stopParsing()
1425 Tokenizer::stopParsing();
1426 xmlStopParser(m_context);
// Marks the parser as paused by setting m_parserPaused.
// NOTE(review): the statement controlled by the m_parsingFragment check is on
// a line missing from this excerpt; presumably an early return so that
// fragment parsing is never paused -- confirm.
1429 void XMLTokenizer::pauseParsing()
1431 if (m_parsingFragment)
1434 m_parserPaused = true;
// Resumes a paused parser. It must only be called while paused (asserted).
// The existing comments describe three stages: replay queued callbacks, write
// the pending source, then end parsing if finish() was already requested.
// NOTE(review): short lines are missing from this excerpt, including the early
// exit taken when a callback pauses the parser again, the call that consumes
// `rest`, and the body of the final `if`.
1437 void XMLTokenizer::resumeParsing()
1439 ASSERT(m_parserPaused);
1441 m_parserPaused = false;
1443 // First, execute any pending callbacks
1444 while (!m_pendingCallbacks->isEmpty()) {
1445 m_pendingCallbacks->callAndRemoveFirstCallback(this);
1447 // A callback paused the parser
1452 // Then, write any pending data
// Take a copy of the pending source, then clear the member.
1453 SegmentedString rest = m_pendingSrc;
1454 m_pendingSrc.clear();
1457 // Finally, if finish() has been called and write() didn't result
1458 // in any further callbacks being queued, call end()
1459 if (m_finishCalled && m_pendingCallbacks->isEmpty())
// SAX startElementNs callback used by parseXMLDocumentFragment(). `closure` is
// the XMLTokenizer itself; all arguments are forwarded to its startElementNs().
1463 static void balancedStartElementNsHandler(void* closure, const xmlChar* localname, const xmlChar* prefix, const xmlChar* uri, int nb_namespaces, const xmlChar** namespaces, int nb_attributes, int nb_defaulted, const xmlChar** libxmlAttributes)
1465 static_cast<XMLTokenizer*>(closure)->startElementNs(localname, prefix, uri, nb_namespaces, namespaces, nb_attributes, nb_defaulted, libxmlAttributes);
// SAX endElementNs callback used by parseXMLDocumentFragment(). `closure` is
// the XMLTokenizer itself; localname, prefix and uri are not used.
1468 static void balancedEndElementNsHandler(void* closure, const xmlChar* localname, const xmlChar* prefix, const xmlChar* uri)
1470 static_cast<XMLTokenizer*>(closure)->endElementNs();
// SAX characters callback used by parseXMLDocumentFragment(). Forwards the
// text run (`s`, `len`) to the XMLTokenizer passed as `closure`.
1473 static void balancedCharactersHandler(void* closure, const xmlChar* s, int len)
1475 static_cast<XMLTokenizer*>(closure)->characters(s, len);
// SAX processingInstruction callback used by parseXMLDocumentFragment().
// Forwards target and data to the XMLTokenizer passed as `closure`.
1478 static void balancedProcessingInstructionHandler(void* closure, const xmlChar* target, const xmlChar* data)
1480 static_cast<XMLTokenizer*>(closure)->processingInstruction(target, data);
// SAX cdataBlock callback used by parseXMLDocumentFragment(), which also
// installs it as the ignorableWhitespace handler. Forwards (`s`, `len`) to
// cdataBlock() on the XMLTokenizer passed as `closure`.
1483 static void balancedCdataBlockHandler(void* closure, const xmlChar* s, int len)
1485 static_cast<XMLTokenizer*>(closure)->cdataBlock(s, len);
// SAX comment callback used by parseXMLDocumentFragment(). Forwards the
// comment text to the XMLTokenizer passed as `closure`.
1488 static void balancedCommentHandler(void* closure, const xmlChar* comment)
1490 static_cast<XMLTokenizer*>(closure)->comment(comment);
// SAX warning callback used by parseXMLDocumentFragment(). Passes the
// printf-style message and its variadic arguments to XMLTokenizer::error()
// with the XMLTokenizer::warning error type.
// NOTE(review): the declaration of `args` and the matching va_end() are on
// lines missing from this excerpt.
1493 static void balancedWarningHandler(void* closure, const char* message, ...)
1496 va_start(args, message);
1497 static_cast<XMLTokenizer*>(closure)->error(XMLTokenizer::warning, message, args);
// Parses `string` as balanced XML content into `fragment`, using a temporary
// XMLTokenizer constructed from (fragment, parent) as the SAX user data.
//   string   - markup to parse; converted to UTF-8 for libxml.
//   fragment - destination DocumentFragment.
//   parent   - context element handed to the tokenizer.
// NOTE(review): the declaration of `sax` and the return statement are on lines
// missing from this excerpt; presumably success is derived from `result`.
1501 bool parseXMLDocumentFragment(const String& string, DocumentFragment* fragment, Element* parent)
1503 XMLTokenizer tokenizer(fragment, parent);
// Start from an all-null handler table and set only the callbacks needed here.
1506 memset(&sax, 0, sizeof(sax));
1508 sax.characters = balancedCharactersHandler;
1509 sax.processingInstruction = balancedProcessingInstructionHandler;
1510 sax.startElementNs = balancedStartElementNsHandler;
1511 sax.endElementNs = balancedEndElementNsHandler;
1512 sax.cdataBlock = balancedCdataBlockHandler;
// NOTE(review): ignorable whitespace is routed to the CDATA handler, not the
// characters handler -- confirm this is intentional.
1513 sax.ignorableWhitespace = balancedCdataBlockHandler;
1514 sax.comment = balancedCommentHandler;
1515 sax.warning = balancedWarningHandler;
// Marks the handler table as SAX2 so the *Ns element callbacks are used.
1516 sax.initialized = XML_SAX2_MAGIC;
// The UTF-8 temporary from string.utf8() lives until the end of this full
// expression, so the pointer stays valid for the duration of the call.
1518 int result = xmlParseBalancedChunkMemory(0, &sax, &tokenizer, 0, (const xmlChar*)(const char*)(string.utf8()), 0);
1522 // --------------------------------
// State shared between parseAttributes() and attributesStartElementNsHandler().
// NOTE(review): the code below also reads and writes a `gotAttributes` member;
// its declaration is on a line missing from this excerpt.
1524 struct AttributeParseState {
// Attribute values keyed by qualified name ("prefix:local", or just "local").
1525 HashMap<String, String> attributes;
// SAX startElementNs callback used by parseAttributes(). For the synthetic
// <attrs> element it records every attribute in the AttributeParseState
// reached through the parser context's _private pointer.
// NOTE(review): the statement controlled by the element-name check is on a
// line missing from this excerpt; presumably an early return for any element
// not named "attrs" -- confirm.
1530 static void attributesStartElementNsHandler(void* closure, const xmlChar* xmlLocalName, const xmlChar* xmlPrefix, const xmlChar* xmlURI, int nb_namespaces, const xmlChar** namespaces, int nb_attributes, int nb_defaulted, const xmlChar** libxmlAttributes)
1532 if (strcmp(reinterpret_cast<const char*>(xmlLocalName), "attrs") != 0)
// Here `closure` is the libxml parser context, not the state object itself.
1535 xmlParserCtxtPtr ctxt = static_cast<xmlParserCtxtPtr>(closure);
1536 AttributeParseState* state = static_cast<AttributeParseState*>(ctxt->_private);
1538 state->gotAttributes = true;
// Reinterpret libxml's flat attribute array as an array of xmlSAX2Attributes records.
1540 xmlSAX2Attributes* attributes = reinterpret_cast<xmlSAX2Attributes*>(libxmlAttributes);
1541 for(int i = 0; i < nb_attributes; i++) {
1542 String attrLocalName = toString(attributes[i].localname);
// The value's length is the distance between its start and end pointers.
1543 int valueLength = (int) (attributes[i].end - attributes[i].value);
1544 String attrValue = toString(attributes[i].value, valueLength);
1545 String attrPrefix = toString(attributes[i].prefix);
// Qualified name: "prefix:local" when a prefix is present, otherwise the local name.
1546 String attrQName = attrPrefix.isEmpty() ? attrLocalName : attrPrefix + ":" + attrLocalName;
1548 state->attributes.set(attrQName, attrValue);
// Parses a string of XML attribute syntax (e.g. `a="1" b="2"`) into a map by
// wrapping it in a synthetic <attrs .../> element and running libxml over it.
//   string  - attribute text to parse.
//   attrsOK - out: set to state.gotAttributes, which the start-element handler
//             sets when it processes the <attrs> element.
// Returns the collected attributes keyed by qualified name.
// NOTE(review): the declaration of `sax` and possibly a guard before
// xmlFreeDoc() are on lines missing from this excerpt.
1552 HashMap<String, String> parseAttributes(const String& string, bool& attrsOK)
1554 AttributeParseState state;
1555 state.gotAttributes = false;
// Only the start-element callback is installed; every other handler stays null.
1558 memset(&sax, 0, sizeof(sax));
1559 sax.startElementNs = attributesStartElementNsHandler;
1560 sax.initialized = XML_SAX2_MAGIC;
1561 xmlParserCtxtPtr parser = createStringParser(&sax, &state);
1562 String parseString = "<?xml version=\"1.0\"?><attrs " + string + " />";
// Feed the raw UChar buffer (length in bytes) as a single, terminating chunk.
1563 xmlParseChunk(parser, reinterpret_cast<const char*>(parseString.characters()), parseString.length() * sizeof(UChar), 1);
1565 xmlFreeDoc(parser->myDoc);
1566 xmlFreeParserCtxt(parser);
1568 attrsOK = state.gotAttributes;
1569 return state.attributes;