2011-01-27 Adam Barth <abarth@webkit.org>
author abarth@webkit.org <abarth@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 27 Jan 2011 21:52:56 +0000 (21:52 +0000)
committer abarth@webkit.org <abarth@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Thu, 27 Jan 2011 21:52:56 +0000 (21:52 +0000)
        Reviewed by Eric Seidel.

        Generalize the mechanism view-source uses to remember the source for an HTMLToken
        https://bugs.webkit.org/show_bug.cgi?id=53200

        Currently view-source tracks the source associated with each HTMLToken.
        We want to re-use this mechanism for the new XSS auditor.  This patch
        moves this code into its own class so it can be shared between the
        view-source parser and the general HTML parser.  This patch also adds
        support for tracking the source of tokens that span document.write
        boundaries.

        No functional change.  This code change is somewhat tested by our
        view-source layout tests.

        * Android.mk:
        * GNUmakefile.am:
        * WebCore.gypi:
        * WebCore.pro:
        * WebCore.vcproj/WebCore.vcproj:
        * WebCore.xcodeproj/project.pbxproj:
            - Fun with updating build files.
        * html/parser/HTMLDocumentParser.cpp:
        (WebCore::HTMLDocumentParser::pumpTokenizer):
            - Teach HTMLDocumentParser to track the source for HTMLTokens.
              Currently, this information isn't used, but it will be shortly.
              I ran the HTML parser benchmark and this change didn't have a
              measurable effect.
        * html/parser/HTMLDocumentParser.h:
            - Composite in the HTMLSourceTracker.
        * html/parser/HTMLSourceTracker.cpp: Added.
        (WebCore::HTMLSourceTracker::HTMLSourceTracker):
        (WebCore::HTMLSourceTracker::start):
        (WebCore::HTMLSourceTracker::end):
            - This function should eventually be folded into HTMLTokenizer.
        (WebCore::HTMLSourceTracker::sourceForToken):
        * html/parser/HTMLSourceTracker.h: Added.
        * html/parser/HTMLToken.h:
            - Now HTMLTokens always have a start index of zero.  To do the job
              of the old start index, this patch introduces the notion of a
              baseOffset.  Unlike the start index (which was used as the base
              offset for all the other indices), the baseOffset can change
              over the lifetime of the token.  We need the flexibility to
              change the offset for tokens that span document.write boundaries.
              Values are now normalized to zero-offset when stored.
        (WebCore::HTMLToken::clear):
        (WebCore::HTMLToken::setBaseOffset):
        (WebCore::HTMLToken::end):
        (WebCore::HTMLToken::beginAttributeName):
        (WebCore::HTMLToken::endAttributeName):
        (WebCore::HTMLToken::beginAttributeValue):
        (WebCore::HTMLToken::endAttributeValue):
        * html/parser/HTMLViewSourceParser.cpp:
            - Updates the HTMLViewSourceParser to use the new
              HTMLSourceTracker.
        (WebCore::HTMLViewSourceParser::pumpTokenizer):
        (WebCore::HTMLViewSourceParser::append):
        (WebCore::HTMLViewSourceParser::sourceForToken):
            - This function now just calls through to HTMLSourceTracker.
        * html/parser/HTMLViewSourceParser.h:
        * platform/text/SegmentedString.cpp:
        (WebCore::SegmentedString::currentColumn):
        (WebCore::SegmentedString::setCurrentPosition):
        * platform/text/SegmentedString.h:
        (WebCore::SegmentedString::numberOfCharactersConsumed):
            - We need to handle the general case now.  The "slow" version
              doesn't turn out to be any slower in practice anyway.

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@76835 268f45cc-cd09-0410-ab3c-d52691b4dbfc

16 files changed:
Source/WebCore/Android.mk
Source/WebCore/ChangeLog
Source/WebCore/GNUmakefile.am
Source/WebCore/WebCore.gypi
Source/WebCore/WebCore.pro
Source/WebCore/WebCore.vcproj/WebCore.vcproj
Source/WebCore/WebCore.xcodeproj/project.pbxproj
Source/WebCore/html/parser/HTMLDocumentParser.cpp
Source/WebCore/html/parser/HTMLDocumentParser.h
Source/WebCore/html/parser/HTMLSourceTracker.cpp [new file with mode: 0644]
Source/WebCore/html/parser/HTMLSourceTracker.h [new file with mode: 0644]
Source/WebCore/html/parser/HTMLToken.h
Source/WebCore/html/parser/HTMLViewSourceParser.cpp
Source/WebCore/html/parser/HTMLViewSourceParser.h
Source/WebCore/platform/text/SegmentedString.cpp
Source/WebCore/platform/text/SegmentedString.h

index 8541aa3..dcb100f 100644 (file)
@@ -340,6 +340,7 @@ LOCAL_SRC_FILES := $(LOCAL_SRC_FILES) \
        html/parser/HTMLParserScheduler.cpp \
        html/parser/HTMLPreloadScanner.cpp \
        html/parser/HTMLScriptRunner.cpp \
+       html/parser/HTMLSourceTracker.cpp \
        html/parser/HTMLTokenizer.cpp \
        html/parser/HTMLTreeBuilder.cpp \
        html/parser/HTMLViewSourceParser.cpp \
index 35cb2eb..a43db7e 100644 (file)
@@ -1,3 +1,73 @@
+2011-01-27  Adam Barth  <abarth@webkit.org>
+
+        Reviewed by Eric Seidel.
+
+        Generalize the mechanism view-source uses to remember the source for an HTMLToken
+        https://bugs.webkit.org/show_bug.cgi?id=53200
+
+        Currently view-source tracks the source associated with each HTMLToken.
+        We want to re-use this mechanism for the new XSS auditor.  This patch
+        moves this code into its own class so it can be shared between the
+        view-source parser and the general HTML parser.  This patch also adds
+        support for tracking the source of tokens that span document.write
+        boundaries.
+
+        No functional change.  This code change is somewhat tested by our
+        view-source layout tests.
+
+        * Android.mk:
+        * GNUmakefile.am:
+        * WebCore.gypi:
+        * WebCore.pro:
+        * WebCore.vcproj/WebCore.vcproj:
+        * WebCore.xcodeproj/project.pbxproj:
+            - Fun with updating build files.
+        * html/parser/HTMLDocumentParser.cpp:
+        (WebCore::HTMLDocumentParser::pumpTokenizer):
+            - Teach HTMLDocumentParser to track the source for HTMLTokens.
+              Currently, this information isn't used, but it will be shortly.
+              I ran the HTML parser benchmark and this change didn't have a
+              measurable effect.
+        * html/parser/HTMLDocumentParser.h:
+            - Composite in the HTMLSourceTracker.
+        * html/parser/HTMLSourceTracker.cpp: Added.
+        (WebCore::HTMLSourceTracker::HTMLSourceTracker):
+        (WebCore::HTMLSourceTracker::start):
+        (WebCore::HTMLSourceTracker::end):
+            - This function should eventually be folded into HTMLTokenizer.
+        (WebCore::HTMLSourceTracker::sourceForToken):
+        * html/parser/HTMLSourceTracker.h: Added.
+        * html/parser/HTMLToken.h:
+            - Now HTMLTokens always have a start index of zero.  To do the job
+              of the old start index, this patch introduces the notion of a
+              baseOffset.  Unlike the start index (which was used as the base
+              offset for all the other indices), the baseOffset can change
+              over the lifetime of the token.  We need the flexibility to
+              change the offset for tokens that span document.write boundaries.
+              Values are now normalized to zero-offset when stored.
+        (WebCore::HTMLToken::clear):
+        (WebCore::HTMLToken::setBaseOffset):
+        (WebCore::HTMLToken::end):
+        (WebCore::HTMLToken::beginAttributeName):
+        (WebCore::HTMLToken::endAttributeName):
+        (WebCore::HTMLToken::beginAttributeValue):
+        (WebCore::HTMLToken::endAttributeValue):
+        * html/parser/HTMLViewSourceParser.cpp:
+            - Updates the HTMLViewSourceParser to use the new
+              HTMLSourceTracker.
+        (WebCore::HTMLViewSourceParser::pumpTokenizer):
+        (WebCore::HTMLViewSourceParser::append):
+        (WebCore::HTMLViewSourceParser::sourceForToken):
+            - This function now just calls through to HTMLSourceTracker.
+        * html/parser/HTMLViewSourceParser.h:
+        * platform/text/SegmentedString.cpp:
+        (WebCore::SegmentedString::currentColumn):
+        (WebCore::SegmentedString::setCurrentPosition):
+        * platform/text/SegmentedString.h:
+        (WebCore::SegmentedString::numberOfCharactersConsumed):
+            - We need to handle the general case now.  The "slow" version
+              doesn't turn out to be any slower in practice anyway.
+
 2011-01-27  Sam Weinig  <sam@webkit.org>
 
         Fix all the builds.
index 84d1df6..2349fdf 100644 (file)
@@ -1866,6 +1866,8 @@ webcore_sources += \
        Source/WebCore/html/parser/HTMLScriptRunner.cpp \
        Source/WebCore/html/parser/HTMLScriptRunner.h \
        Source/WebCore/html/parser/HTMLScriptRunnerHost.h \
+       Source/WebCore/html/parser/HTMLSourceTracker.cpp \
+       Source/WebCore/html/parser/HTMLSourceTracker.h \
        Source/WebCore/html/parser/HTMLToken.h \
        Source/WebCore/html/parser/HTMLTokenizer.cpp \
        Source/WebCore/html/parser/HTMLTokenizer.h \
index 3a96529..0248578 100644 (file)
             'html/parser/HTMLScriptRunner.cpp',
             'html/parser/HTMLScriptRunner.h',
             'html/parser/HTMLScriptRunnerHost.h',
+            'html/parser/HTMLSourceTracker.cpp',
+            'html/parser/HTMLSourceTracker.h',
             'html/parser/HTMLToken.h',
             'html/parser/HTMLTokenizer.cpp',
             'html/parser/HTMLTokenizer.h',
index aec3ecb..c6caa2a 100644 (file)
@@ -1033,6 +1033,7 @@ SOURCES += \
     html/parser/HTMLParserScheduler.cpp \
     html/parser/HTMLPreloadScanner.cpp \
     html/parser/HTMLScriptRunner.cpp \
+    html/parser/HTMLSourceTracker.cpp \
     html/parser/HTMLTokenizer.cpp \
     html/parser/HTMLTreeBuilder.cpp \
     html/parser/HTMLViewSourceParser.cpp \
index 74740d7..90b41e4 100755 (executable)
                                        >
                                </File>
                                <File
+                                       RelativePath="..\html\parser\HTMLSourceTracker.cpp"
+                                       >
+                               </File>
+                               <File
+                                       RelativePath="..\html\parser\HTMLSourceTracker.h"
+                                       >
+                               </File>
+                               <File
                                        RelativePath="..\html\parser\HTMLToken.h"
                                        >
                                </File>
index d9f1105..6417c97 100644 (file)
                977B3878122883E900B81FF8 /* HTMLTokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 977B385F122883E900B81FF8 /* HTMLTokenizer.h */; };
                977B3879122883E900B81FF8 /* HTMLViewSourceParser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 977B3860122883E900B81FF8 /* HTMLViewSourceParser.cpp */; };
                977B387A122883E900B81FF8 /* HTMLViewSourceParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 977B3861122883E900B81FF8 /* HTMLViewSourceParser.h */; };
+               977E2DCD12F0E28300C13379 /* HTMLSourceTracker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 977E2DCB12F0E28300C13379 /* HTMLSourceTracker.cpp */; };
+               977E2DCE12F0E28300C13379 /* HTMLSourceTracker.h in Headers */ = {isa = PBXBuildFile; fileRef = 977E2DCC12F0E28300C13379 /* HTMLSourceTracker.h */; };
                979F43D31075E44A0000F83B /* NavigationScheduler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 979F43D11075E44A0000F83B /* NavigationScheduler.cpp */; };
                979F43D41075E44A0000F83B /* NavigationScheduler.h in Headers */ = {isa = PBXBuildFile; fileRef = 979F43D21075E44A0000F83B /* NavigationScheduler.h */; settings = {ATTRIBUTES = (Private, ); }; };
                97BC84831236FD93000C6161 /* TextDocumentParser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 97BC84811236FD93000C6161 /* TextDocumentParser.cpp */; };
                977B385F122883E900B81FF8 /* HTMLTokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = HTMLTokenizer.h; path = parser/HTMLTokenizer.h; sourceTree = "<group>"; };
                977B3860122883E900B81FF8 /* HTMLViewSourceParser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = HTMLViewSourceParser.cpp; path = parser/HTMLViewSourceParser.cpp; sourceTree = "<group>"; };
                977B3861122883E900B81FF8 /* HTMLViewSourceParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = HTMLViewSourceParser.h; path = parser/HTMLViewSourceParser.h; sourceTree = "<group>"; };
+               977E2DCB12F0E28300C13379 /* HTMLSourceTracker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = HTMLSourceTracker.cpp; path = parser/HTMLSourceTracker.cpp; sourceTree = "<group>"; };
+               977E2DCC12F0E28300C13379 /* HTMLSourceTracker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = HTMLSourceTracker.h; path = parser/HTMLSourceTracker.h; sourceTree = "<group>"; };
                979F43D11075E44A0000F83B /* NavigationScheduler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NavigationScheduler.cpp; sourceTree = "<group>"; };
                979F43D21075E44A0000F83B /* NavigationScheduler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = NavigationScheduler.h; sourceTree = "<group>"; };
                97BC84811236FD93000C6161 /* TextDocumentParser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = TextDocumentParser.cpp; path = parser/TextDocumentParser.cpp; sourceTree = "<group>"; };
                                977B385B122883E900B81FF8 /* HTMLScriptRunner.cpp */,
                                977B385C122883E900B81FF8 /* HTMLScriptRunner.h */,
                                977B385D122883E900B81FF8 /* HTMLScriptRunnerHost.h */,
+                               977E2DCB12F0E28300C13379 /* HTMLSourceTracker.cpp */,
+                               977E2DCC12F0E28300C13379 /* HTMLSourceTracker.h */,
                                97C1F552122855CB00EDE616 /* HTMLToken.h */,
                                977B385E122883E900B81FF8 /* HTMLTokenizer.cpp */,
                                977B385F122883E900B81FF8 /* HTMLTokenizer.h */,
                                4F2D205412EAE7B3005C2874 /* InspectorAgent.h in Headers */,
                                BC9585E112F0989500755821 /* PlatformGestureEvent.h in Headers */,
                                E134F5AB12EE343F004EC58D /* IntRectHash.h in Headers */,
+                               977E2DCE12F0E28300C13379 /* HTMLSourceTracker.h in Headers */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
                                97DD4D860FDF4D6E00ECF9A4 /* XSSAuditor.cpp in Sources */,
                                BC8AE34E12EA096A00EB3AE6 /* ScrollableArea.cpp in Sources */,
                                4F2D205512EAE7B3005C2874 /* InspectorAgent.cpp in Sources */,
+                               977E2DCD12F0E28300C13379 /* HTMLSourceTracker.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
index c9d5e10..6072e45 100644 (file)
@@ -226,8 +226,11 @@ void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode)
         if (!m_treeBuilder->isParsingFragment()
             && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending())
             break;
+
+        m_sourceTracker.start(m_input, m_token);
         if (!m_tokenizer->nextToken(m_input.current(), m_token))
             break;
+        m_sourceTracker.end(m_input, m_token);
 
         m_treeBuilder->constructTreeFromToken(m_token);
         m_token.clear();
index f925269..78f6136 100644 (file)
@@ -30,6 +30,7 @@
 #include "FragmentScriptingPermission.h"
 #include "HTMLInputStream.h"
 #include "HTMLScriptRunnerHost.h"
+#include "HTMLSourceTracker.h"
 #include "HTMLToken.h"
 #include "ScriptableDocumentParser.h"
 #include "SegmentedString.h"
@@ -141,6 +142,7 @@ private:
     OwnPtr<HTMLTreeBuilder> m_treeBuilder;
     OwnPtr<HTMLPreloadScanner> m_preloadScanner;
     OwnPtr<HTMLParserScheduler> m_parserScheduler;
+    HTMLSourceTracker m_sourceTracker;
 
     bool m_endWasDelayed;
     unsigned m_writeNestingLevel;
diff --git a/Source/WebCore/html/parser/HTMLSourceTracker.cpp b/Source/WebCore/html/parser/HTMLSourceTracker.cpp
new file mode 100644 (file)
index 0000000..9d8328f
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2010 Adam Barth. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "HTMLSourceTracker.h"
+
+namespace WebCore {
+
+HTMLSourceTracker::HTMLSourceTracker()
+{
+}
+
+void HTMLSourceTracker::start(const HTMLInputStream& input, HTMLToken& token)
+{
+    m_sourceFromPreviousSegments = token.type() == HTMLToken::Uninitialized ? String() : m_sourceFromPreviousSegments + m_source.toString();
+    m_source = input.current();
+    token.setBaseOffset(input.current().numberOfCharactersConsumed() - m_sourceFromPreviousSegments.length());
+}
+
+void HTMLSourceTracker::end(const HTMLInputStream& input, HTMLToken& token)
+{
+    // FIXME: This work should really be done by the HTMLTokenizer.
+    token.end(input.current().numberOfCharactersConsumed());
+}
+
+String HTMLSourceTracker::sourceForToken(const HTMLToken& token)
+{
+    if (token.type() == HTMLToken::EndOfFile)
+        return String(); // Hides the null character we use to mark the end of file.
+
+    ASSERT(!token.startIndex());
+    UChar* data = 0;
+    int length = token.endIndex() - token.startIndex() - m_sourceFromPreviousSegments.length();
+    String source = String::createUninitialized(length, data);
+    for (int i = 0; i < length; ++i) {
+        data[i] = *m_source;
+        m_source.advance();
+    }
+    return m_sourceFromPreviousSegments + source;
+}
+
+}
diff --git a/Source/WebCore/html/parser/HTMLSourceTracker.h b/Source/WebCore/html/parser/HTMLSourceTracker.h
new file mode 100644 (file)
index 0000000..df322b9
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2010 Adam Barth. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HTMLSourceTracker_h
+#define HTMLSourceTracker_h
+
+#include "HTMLInputStream.h"
+#include "HTMLToken.h"
+
+namespace WebCore {
+
+class HTMLSourceTracker {
+    WTF_MAKE_NONCOPYABLE(HTMLSourceTracker);
+public:
+    HTMLSourceTracker();
+
+    // FIXME: Once we move "end" into HTMLTokenizer, rename "start" to
+    // something that makes it obvious that this method can be called multiple
+    // times.
+    void start(const HTMLInputStream&, HTMLToken&);
+    void end(const HTMLInputStream&, HTMLToken&);
+
+    String sourceForToken(const HTMLToken&);
+
+private:
+    String m_sourceFromPreviousSegments;
+    SegmentedString m_source;
+};
+
+}
+
+#endif
index 1cbc151..a2af2a4 100644 (file)
@@ -64,20 +64,26 @@ public:
 
     HTMLToken() { clear(); }
 
-    void clear(int startIndex = 0)
+    void clear()
     {
         m_type = Uninitialized;
-        m_range.m_start = startIndex;
-        m_range.m_end = startIndex;
+        m_range.m_start = 0;
+        m_range.m_end = 0;
+        m_baseOffset = 0;
         m_data.clear();
     }
 
     int startIndex() const { return m_range.m_start; }
     int endIndex() const { return m_range.m_end; }
 
-    void end(int endIndex)
+    void setBaseOffset(int offset)
     {
-        m_range.m_end = endIndex;
+        m_baseOffset = offset;
+    }
+
+    void end(int endOffset)
+    {
+        m_range.m_end = endOffset - m_baseOffset;
     }
 
     void makeEndOfFile()
@@ -172,29 +178,30 @@ public:
 #endif
     }
 
-    void beginAttributeName(int index)
+    void beginAttributeName(int offset)
     {
-        m_currentAttribute->m_nameRange.m_start = index;
+        m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset;
     }
 
-    void endAttributeName(int index)
+    void endAttributeName(int offset)
     {
+        int index = offset - m_baseOffset;
         m_currentAttribute->m_nameRange.m_end = index;
         m_currentAttribute->m_valueRange.m_start = index;
         m_currentAttribute->m_valueRange.m_end = index;
     }
 
-    void beginAttributeValue(int index)
+    void beginAttributeValue(int offset)
     {
-        m_currentAttribute->m_valueRange.m_start = index;
+        m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset;
 #ifndef NDEBUG
         m_currentAttribute->m_valueRange.m_end = 0;
 #endif
     }
 
-    void endAttributeValue(int index)
+    void endAttributeValue(int offset)
     {
-        m_currentAttribute->m_valueRange.m_end = index;
+        m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset;
     }
 
     void appendToAttributeName(UChar character)
@@ -331,9 +338,8 @@ private:
     };
 
     Type m_type;
-
-    // Which characters from the input stream are represented by this token.
-    Range m_range;
+    Range m_range; // Always starts at zero.
+    int m_baseOffset;
 
     // "name" for DOCTYPE, StartTag, and EndTag
     // "characters" for Character
index ace8590..7cdbdc7 100644 (file)
@@ -49,35 +49,27 @@ void HTMLViewSourceParser::insert(const SegmentedString&)
 
 void HTMLViewSourceParser::pumpTokenizer()
 {
-    while (m_tokenizer->nextToken(m_input.current(), m_token)) {
-        m_token.end(m_input.current().numberOfCharactersConsumed());
+    while (true) {
+        m_sourceTracker.start(m_input, m_token);
+        if (!m_tokenizer->nextToken(m_input.current(), m_token))
+            break;
+        m_sourceTracker.end(m_input, m_token);
+
         document()->addSource(sourceForToken(), m_token);
         updateTokenizerState();
-        m_token.clear(m_input.current().numberOfCharactersConsumed());
+        m_token.clear();
     }
 }
 
 void HTMLViewSourceParser::append(const SegmentedString& input)
 {
     m_input.appendToEnd(input);
-    m_source.append(input);
     pumpTokenizer();
 }
 
 String HTMLViewSourceParser::sourceForToken()
 {
-    if (m_token.type() == HTMLToken::EndOfFile)
-        return String();
-
-    ASSERT(m_source.numberOfCharactersConsumed() == m_token.startIndex());
-    UChar* data = 0;
-    int length = m_token.endIndex() - m_token.startIndex();
-    String source = String::createUninitialized(length, data);
-    for (int i = 0; i < length; ++i) {
-        data[i] = *m_source;
-        m_source.advance();
-    }
-    return source;
+    return m_sourceTracker.sourceForToken(m_token);
 }
 
 void HTMLViewSourceParser::updateTokenizerState()
index abe55b4..2e6ddfe 100644 (file)
@@ -28,6 +28,7 @@
 
 #include "DecodedDataDocumentParser.h"
 #include "HTMLInputStream.h"
+#include "HTMLSourceTracker.h"
 #include "HTMLToken.h"
 #include "HTMLTokenizer.h"
 #include "HTMLViewSourceDocument.h"
@@ -69,8 +70,8 @@ private:
     void updateTokenizerState();
 
     HTMLInputStream m_input;
-    SegmentedString m_source;
     HTMLToken m_token;
+    HTMLSourceTracker m_sourceTracker;
     OwnPtr<HTMLTokenizer> m_tokenizer;
 };
 
index 5e9755b..7c859dc 100644 (file)
@@ -186,17 +186,6 @@ void SegmentedString::advanceSubstring()
     }
 }
 
-int SegmentedString::numberOfCharactersConsumedSlow() const
-{
-    int result = m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed();
-    if (m_pushedChar1) {
-        --result;
-        if (m_pushedChar2)
-            --result;
-    }
-    return result;
-}
-
 String SegmentedString::toString() const
 {
     String result;
@@ -262,14 +251,14 @@ WTF::ZeroBasedNumber SegmentedString::currentLine() const
 
 WTF::ZeroBasedNumber SegmentedString::currentColumn() const
 {
-    int zeroBasedColumn = numberOfCharactersConsumedSlow() - m_numberOfCharactersConsumedPriorToCurrentLine;
+    int zeroBasedColumn = numberOfCharactersConsumed() - m_numberOfCharactersConsumedPriorToCurrentLine;
     return WTF::ZeroBasedNumber::fromZeroBasedInt(zeroBasedColumn);
 }
 
 void SegmentedString::setCurrentPosition(WTF::ZeroBasedNumber line, WTF::ZeroBasedNumber columnAftreProlog, int prologLength)
 {
     m_currentLine = line.zeroBasedInt();
-    m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumedSlow() + prologLength - columnAftreProlog.zeroBasedInt();
+    m_numberOfCharactersConsumedPriorToCurrentLine = numberOfCharactersConsumed() + prologLength - columnAftreProlog.zeroBasedInt();
 }
 
 }
index 30c899d..3784b50 100644 (file)
@@ -206,13 +206,15 @@ public:
 
     int numberOfCharactersConsumed() const
     {
-        // We don't currently handle the case when there are pushed character.
-        ASSERT(!m_pushedChar1);
-        return m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed();
+        int numberOfPushedCharacters = 0;
+        if (m_pushedChar1) {
+            ++numberOfPushedCharacters;
+            if (m_pushedChar2)
+                ++numberOfPushedCharacters;
+        }
+        return m_numberOfCharactersConsumedPriorToCurrentString + m_currentString.numberOfCharactersConsumed() - numberOfPushedCharacters;
     }
 
-    int numberOfCharactersConsumedSlow() const;
-
     String toString() const;
 
     const UChar& operator*() const { return *current(); }