2 * Copyright (C) 2004-2014 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "TextIterator.h"
31 #include "FontCascade.h"
33 #include "HTMLBodyElement.h"
34 #include "HTMLElement.h"
35 #include "HTMLFrameOwnerElement.h"
36 #include "HTMLInputElement.h"
37 #include "HTMLLegendElement.h"
38 #include "HTMLMeterElement.h"
39 #include "HTMLNames.h"
40 #include "HTMLParagraphElement.h"
41 #include "HTMLProgressElement.h"
42 #include "HTMLSlotElement.h"
43 #include "HTMLTextAreaElement.h"
44 #include "HTMLTextFormControlElement.h"
45 #include "InlineTextBox.h"
46 #include "NodeTraversal.h"
47 #include "RenderImage.h"
48 #include "RenderIterator.h"
49 #include "RenderTableCell.h"
50 #include "RenderTableRow.h"
51 #include "RenderTextControl.h"
52 #include "RenderTextFragment.h"
53 #include "ShadowRoot.h"
54 #include "SimpleLineLayout.h"
55 #include "SimpleLineLayoutResolver.h"
56 #include "TextBoundaries.h"
57 #include <wtf/text/TextBreakIterator.h>
58 #include "TextControlInnerElements.h"
59 #include "VisiblePosition.h"
60 #include "VisibleUnits.h"
61 #include "htmlediting.h"
62 #include <wtf/text/CString.h>
63 #include <wtf/text/StringBuilder.h>
64 #include <wtf/unicode/CharacterNames.h>
66 #if !UCONFIG_NO_COLLATION
67 #include <unicode/usearch.h>
68 #include <wtf/text/TextBreakIteratorInternalICU.h>
71 using namespace WTF::Unicode;
75 using namespace HTMLNames;
77 // Buffer that knows how to compare with a search target.
78 // Keeps enough of the previous text to be able to search in the future, but no more.
79 // Non-breaking spaces are always equal to normal spaces.
80 // Case folding is also done if the CaseInsensitive option is specified.
81 // Matches are further filtered if the AtWordStarts option is specified, although some
82 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
84 WTF_MAKE_NONCOPYABLE(SearchBuffer);
86 SearchBuffer(const String& target, FindOptions);
89 // Returns number of characters appended; guaranteed to be in the range [1, length].
90 size_t append(StringView);
91 bool needsMoreContext() const;
92 void prependContext(StringView);
95 // Result is the size in characters of what was found.
96 // And <startOffset> is the number of characters back to the start of what was found.
97 size_t search(size_t& startOffset);
100 #if !UCONFIG_NO_COLLATION
103 bool isBadMatch(const UChar*, size_t length) const;
104 bool isWordStartMatch(size_t start, size_t length) const;
105 bool isWordEndMatch(size_t start, size_t length) const;
107 const String m_target;
108 const StringView::UpconvertedCharacters m_targetCharacters;
109 FindOptions m_options;
111 Vector<UChar> m_buffer;
113 size_t m_prefixLength;
115 bool m_needsMoreContext;
117 const bool m_targetRequiresKanaWorkaround;
118 Vector<UChar> m_normalizedTarget;
119 mutable Vector<UChar> m_normalizedMatch;
124 void append(UChar, bool isCharacterStart);
125 size_t length() const;
128 FindOptions m_options;
130 Vector<UChar> m_buffer;
131 Vector<bool> m_isCharacterStartBuffer;
140 static const unsigned bitsInWord = sizeof(unsigned) * 8;
141 static const unsigned bitInWordMask = bitsInWord - 1;
148 BitStack::~BitStack()
152 void BitStack::push(bool bit)
154 unsigned index = m_size / bitsInWord;
155 unsigned shift = m_size & bitInWordMask;
156 if (!shift && index == m_words.size()) {
157 m_words.grow(index + 1);
160 unsigned& word = m_words[index];
161 unsigned mask = 1U << shift;
175 bool BitStack::top() const
179 unsigned shift = (m_size - 1) & bitInWordMask;
180 return m_words.last() & (1U << shift);
183 unsigned BitStack::size() const
190 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
191 static Node* nextInPreOrderCrossingShadowBoundaries(Node& rangeEndContainer, int rangeEndOffset)
193 if (rangeEndOffset >= 0 && !rangeEndContainer.offsetInCharacters()) {
194 if (Node* next = rangeEndContainer.traverseToChildAt(rangeEndOffset))
197 for (Node* node = &rangeEndContainer; node; node = node->parentOrShadowHostNode()) {
198 if (Node* next = node->nextSibling())
204 static inline bool fullyClipsContents(Node& node)
206 auto* renderer = node.renderer();
208 if (!is<Element>(node))
210 return !downcast<Element>(node).hasDisplayContents();
212 if (!is<RenderBox>(*renderer))
214 auto& box = downcast<RenderBox>(*renderer);
215 if (!box.hasOverflowClip())
218 // Quirk to keep copy/paste in the CodeMirror editor version used in Jenkins working.
219 if (is<HTMLTextAreaElement>(node))
220 return box.size().isEmpty();
222 return box.contentSize().isEmpty();
225 static inline bool ignoresContainerClip(Node& node)
227 auto* renderer = node.renderer();
228 if (!renderer || renderer->isTextOrLineBreak())
230 return renderer->style().hasOutOfFlowPosition();
233 static void pushFullyClippedState(BitStack& stack, Node& node)
235 // Push true if this node full clips its contents, or if a parent already has fully
236 // clipped and this is not a node that ignores its container's clip.
237 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
240 static void setUpFullyClippedStack(BitStack& stack, Node& node)
242 // Put the nodes in a vector so we can iterate in reverse order.
243 // FIXME: This (and TextIterator in general) should use ComposedTreeIterator.
244 Vector<Node*, 100> ancestry;
245 for (Node* parent = node.parentOrShadowHostNode(); parent; parent = parent->parentOrShadowHostNode())
246 ancestry.append(parent);
248 // Call pushFullyClippedState on each node starting with the earliest ancestor.
249 size_t size = ancestry.size();
250 for (size_t i = 0; i < size; ++i)
251 pushFullyClippedState(stack, *ancestry[size - i - 1]);
252 pushFullyClippedState(stack, node);
255 static bool isClippedByFrameAncestor(const Document& document, TextIteratorBehavior behavior)
257 if (!(behavior & TextIteratorClipsToFrameAncestors))
260 for (auto* owner = document.ownerElement(); owner; owner = owner->document().ownerElement()) {
261 BitStack ownerClipStack;
262 setUpFullyClippedStack(ownerClipStack, *owner);
263 if (ownerClipStack.top())
269 // FIXME: editingIgnoresContent and isRendererReplacedElement try to do the same job.
270 // It's not good to have both of them.
271 bool isRendererReplacedElement(RenderObject* renderer)
276 bool isAttachment = false;
277 #if ENABLE(ATTACHMENT_ELEMENT)
278 isAttachment = renderer->isAttachment();
280 if (renderer->isImage() || renderer->isWidget() || renderer->isMedia() || isAttachment)
283 if (is<Element>(renderer->node())) {
284 Element& element = downcast<Element>(*renderer->node());
285 if (is<HTMLFormControlElement>(element) || is<HTMLLegendElement>(element) || is<HTMLProgressElement>(element) || element.hasTagName(meterTag))
287 if (equalLettersIgnoringASCIICase(element.attributeWithoutSynchronization(roleAttr), "img"))
296 inline void TextIteratorCopyableText::reset()
298 m_singleCharacter = 0;
304 inline void TextIteratorCopyableText::set(String&& string)
306 m_singleCharacter = 0;
307 m_string = WTFMove(string);
309 m_length = m_string.length();
312 inline void TextIteratorCopyableText::set(String&& string, unsigned offset, unsigned length)
314 ASSERT(offset < string.length());
316 ASSERT(length <= string.length() - offset);
318 m_singleCharacter = 0;
319 m_string = WTFMove(string);
324 inline void TextIteratorCopyableText::set(UChar singleCharacter)
326 m_singleCharacter = singleCharacter;
332 void TextIteratorCopyableText::appendToStringBuilder(StringBuilder& builder) const
334 if (m_singleCharacter)
335 builder.append(m_singleCharacter);
337 builder.append(m_string, m_offset, m_length);
342 TextIterator::TextIterator(const Range* range, TextIteratorBehavior behavior)
343 : m_behavior(behavior)
345 // FIXME: Only m_positionNode above needs to be initialized if range is null.
349 range->ownerDocument().updateLayoutIgnorePendingStylesheets();
351 m_startContainer = &range->startContainer();
353 // Callers should be handing us well-formed ranges. If we discover that this isn't
354 // the case, we could consider changing this assertion to an early return.
355 ASSERT(range->boundaryPointsValid());
357 m_startOffset = range->startOffset();
358 m_endContainer = &range->endContainer();
359 m_endOffset = range->endOffset();
361 // Set up the current node for processing.
362 m_node = range->firstNode();
366 if (isClippedByFrameAncestor(m_node->document(), m_behavior))
369 setUpFullyClippedStack(m_fullyClippedStack, *m_node);
371 m_offset = m_node == m_startContainer ? m_startOffset : 0;
373 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(*m_endContainer, m_endOffset);
375 m_positionNode = m_node;
380 TextIterator::~TextIterator()
384 void TextIterator::advance()
388 // reset the run information
389 m_positionNode = nullptr;
390 m_copyableText.reset();
391 m_text = StringView();
393 // handle remembered node that needed a newline after the text node's newline
394 if (m_nodeForAdditionalNewline) {
395 // Emit the extra newline, and position it *inside* m_node, after m_node's
396 // contents, in case it's a block, in the same way that we position the first
397 // newline. The range for the emitted newline should start where the line
399 // FIXME: It would be cleaner if we emitted two newlines during the last
400 // iteration, instead of using m_needsAnotherNewline.
401 emitCharacter('\n', *m_nodeForAdditionalNewline->parentNode(), m_nodeForAdditionalNewline, 1, 1);
402 m_nodeForAdditionalNewline = nullptr;
406 if (!m_textBox && m_remainingTextBox) {
407 m_textBox = m_remainingTextBox;
408 m_remainingTextBox = nullptr;
409 m_firstLetterText = nullptr;
412 // handle remembered text box
419 while (m_node && m_node != m_pastEndNode) {
420 // if the range ends at offset 0 of an element, represent the
421 // position, but not the content, of that element e.g. if the
422 // node is a blockflow element, emit a newline that
423 // precedes the element
424 if (m_node == m_endContainer && !m_endOffset) {
425 representNodeOffsetZero();
430 auto* renderer = m_node->renderer();
432 m_handledNode = true;
433 m_handledChildren = true;
435 // handle current node according to its type
436 if (!m_handledNode) {
437 if (renderer->isText() && m_node->isTextNode())
438 m_handledNode = handleTextNode();
439 else if (isRendererReplacedElement(renderer))
440 m_handledNode = handleReplacedElement();
442 m_handledNode = handleNonTextNode();
448 // find a new current node to handle in depth-first manner,
449 // calling exitNode() as we come back thru a parent node
450 Node* next = m_handledChildren ? nullptr : m_node->firstChild();
453 next = m_node->nextSibling();
455 bool pastEnd = NodeTraversal::next(*m_node) == m_pastEndNode;
456 Node* parentNode = m_node->parentOrShadowHostNode();
457 while (!next && parentNode) {
458 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(*parentNode))
460 bool haveRenderer = m_node->renderer();
461 Node* exitedNode = m_node;
463 m_fullyClippedStack.pop();
464 parentNode = m_node->parentOrShadowHostNode();
466 exitNode(exitedNode);
467 if (m_positionNode) {
468 m_handledNode = true;
469 m_handledChildren = true;
472 next = m_node->nextSibling();
475 m_fullyClippedStack.pop();
478 // set the new current node
481 pushFullyClippedState(m_fullyClippedStack, *m_node);
482 m_handledNode = false;
483 m_handledChildren = false;
484 m_handledFirstLetter = false;
485 m_firstLetterText = nullptr;
487 // how would this ever be?
493 static bool hasVisibleTextNode(RenderText& renderer)
495 if (renderer.style().visibility() == VISIBLE)
497 if (is<RenderTextFragment>(renderer)) {
498 if (auto firstLetter = downcast<RenderTextFragment>(renderer).firstLetter()) {
499 if (firstLetter->style().visibility() == VISIBLE)
506 static unsigned textNodeOffsetInFlow(const Text& firstTextNodeInRange)
508 // Calculate the text offset for simple lines.
509 RenderObject* renderer = firstTextNodeInRange.renderer();
512 unsigned textOffset = 0;
513 for (renderer = renderer->previousSibling(); renderer; renderer = renderer->previousSibling()) {
514 if (is<RenderText>(renderer))
515 textOffset += downcast<RenderText>(renderer)->textLength();
520 static bool isNewLineOrTabCharacter(UChar character)
522 return character == '\n' || character == '\t';
525 bool TextIterator::handleTextNode()
527 Text& textNode = downcast<Text>(*m_node);
529 if (m_fullyClippedStack.top() && !(m_behavior & TextIteratorIgnoresStyleVisibility))
532 auto& renderer = *textNode.renderer();
533 m_lastTextNode = &textNode;
534 String rendererText = renderer.text();
536 // handle pre-formatted text
537 if (!renderer.style().collapseWhiteSpace()) {
538 int runStart = m_offset;
539 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
540 emitCharacter(' ', textNode, nullptr, runStart, runStart);
543 if (!m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset) {
544 handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer));
545 if (m_firstLetterText) {
546 String firstLetter = m_firstLetterText->text();
547 emitText(textNode, *m_firstLetterText, m_offset, m_offset + firstLetter.length());
548 m_firstLetterText = nullptr;
553 if (renderer.style().visibility() != VISIBLE && !(m_behavior & TextIteratorIgnoresStyleVisibility))
555 int rendererTextLength = rendererText.length();
556 int end = (&textNode == m_endContainer) ? m_endOffset : INT_MAX;
557 int runEnd = std::min(rendererTextLength, end);
559 if (runStart >= runEnd)
562 emitText(textNode, renderer, runStart, runEnd);
566 if (const auto* layout = renderer.simpleLineLayout()) {
567 if (renderer.style().visibility() != VISIBLE && !(m_behavior & TextIteratorIgnoresStyleVisibility))
569 ASSERT(renderer.parent());
570 ASSERT(is<RenderBlockFlow>(*renderer.parent()));
571 const auto& blockFlow = downcast<RenderBlockFlow>(*renderer.parent());
572 // Use the simple layout runs to iterate over the text content.
573 bool isNewTextNode = m_previousSimpleTextNodeInFlow && m_previousSimpleTextNodeInFlow != &textNode;
574 // Simple line layout run positions are all absolute to the parent flow.
575 // Offsetting is required when multiple renderers are present.
576 m_accumulatedSimpleTextLengthInFlow += isNewTextNode ? m_previousSimpleTextNodeInFlow->renderer()->text()->length() : 0;
577 m_previousSimpleTextNodeInFlow = &textNode;
579 unsigned endPosition = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : rendererText.length();
580 if (!m_flowRunResolverCache || &m_flowRunResolverCache->flow() != &blockFlow) {
581 m_accumulatedSimpleTextLengthInFlow = m_flowRunResolverCache ? 0 : textNodeOffsetInFlow(textNode);
582 m_flowRunResolverCache = std::make_unique<SimpleLineLayout::RunResolver>(blockFlow, *layout);
584 // Skip to m_offset position.
585 auto range = m_flowRunResolverCache->rangeForRenderer(renderer);
586 auto it = range.begin();
587 auto end = range.end();
588 while (it != end && (*it).end() <= (static_cast<unsigned>(m_offset) + m_accumulatedSimpleTextLengthInFlow))
590 if (m_nextRunNeedsWhitespace && rendererText[m_offset - 1] == '\n') {
591 emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1);
595 // Collapsed trailing whitespace.
596 m_offset = endPosition;
597 m_lastTextNodeEndedWithCollapsedSpace = true;
600 if (m_nextRunNeedsWhitespace) {
601 emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1);
604 const auto run = *it;
605 ASSERT(run.end() - run.start() <= rendererText.length());
606 // contentStart skips leading whitespace.
607 unsigned contentStart = std::max<unsigned>(m_offset, run.start() - m_accumulatedSimpleTextLengthInFlow);
608 unsigned contentEnd = std::min(endPosition, run.end() - m_accumulatedSimpleTextLengthInFlow);
609 ASSERT_WITH_SECURITY_IMPLICATION(contentStart <= contentEnd);
610 // Check if whitespace adjustment is needed when crossing renderer boundary.
612 bool lastCharacterIsNotWhitespace = m_lastCharacter && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter);
613 bool addTrailingWhitespaceForPrevious = m_lastTextNodeEndedWithCollapsedSpace && lastCharacterIsNotWhitespace;
614 bool leadingWhitespaceIsNeededForCurrent = contentStart > static_cast<unsigned>(m_offset) && lastCharacterIsNotWhitespace;
615 if (addTrailingWhitespaceForPrevious || leadingWhitespaceIsNeededForCurrent) {
616 emitCharacter(' ', textNode, nullptr, m_offset, m_offset + 1);
620 // \n \t single whitespace characters need replacing so that the new line/tab characters don't show up.
621 unsigned stopPosition = contentStart;
622 while (stopPosition < contentEnd && !isNewLineOrTabCharacter(rendererText[stopPosition]))
624 // Emit the text up to the new line/tab character.
625 if (stopPosition < contentEnd) {
626 if (stopPosition == contentStart) {
627 emitCharacter(' ', textNode, nullptr, contentStart, contentStart + 1);
628 m_offset = contentStart + 1;
631 emitText(textNode, renderer, contentStart, stopPosition);
632 m_offset = stopPosition + 1;
633 m_nextRunNeedsWhitespace = true;
636 emitText(textNode, renderer, contentStart, contentEnd);
637 // When line ending with collapsed whitespace is present, we need to carry over one whitespace: foo(end of line)bar -> foo bar (otherwise we would end up with foobar).
638 m_nextRunNeedsWhitespace = run.isEndOfLine() && contentEnd < endPosition && renderer.style().isCollapsibleWhiteSpace(rendererText[contentEnd]);
639 m_offset = contentEnd;
640 return static_cast<unsigned>(m_offset) == endPosition;
643 if (renderer.firstTextBox())
644 m_textBox = renderer.firstTextBox();
646 bool shouldHandleFirstLetter = !m_handledFirstLetter && is<RenderTextFragment>(renderer) && !m_offset;
647 if (shouldHandleFirstLetter)
648 handleTextNodeFirstLetter(downcast<RenderTextFragment>(renderer));
650 if (!renderer.firstTextBox() && rendererText.length() && !shouldHandleFirstLetter) {
651 if (renderer.style().visibility() != VISIBLE && !(m_behavior & TextIteratorIgnoresStyleVisibility))
653 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
657 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
658 auto& boxesRenderer = m_firstLetterText ? *m_firstLetterText : renderer;
659 if (boxesRenderer.containsReversedText()) {
660 m_sortedTextBoxes.clear();
661 for (InlineTextBox* textBox = boxesRenderer.firstTextBox(); textBox; textBox = textBox->nextTextBox())
662 m_sortedTextBoxes.append(textBox);
663 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
664 m_sortedTextBoxesPosition = 0;
665 m_textBox = m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes[0];
672 void TextIterator::handleTextBox()
674 Text& textNode = downcast<Text>(*m_node);
676 auto& renderer = m_firstLetterText ? *m_firstLetterText : *textNode.renderer();
677 if (renderer.style().visibility() != VISIBLE && !(m_behavior & TextIteratorIgnoresStyleVisibility)) {
681 String rendererText = renderer.text();
682 unsigned start = m_offset;
683 unsigned end = (&textNode == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
685 unsigned textBoxStart = m_textBox->start();
686 unsigned runStart = std::max(textBoxStart, start);
688 // Check for collapsed space at the start of this run.
689 InlineTextBox* firstTextBox = renderer.containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? nullptr : m_sortedTextBoxes[0]) : renderer.firstTextBox();
690 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace || (m_textBox == firstTextBox && textBoxStart == runStart && runStart);
691 if (needSpace && !renderer.style().isCollapsibleWhiteSpace(m_lastCharacter) && m_lastCharacter) {
692 if (m_lastTextNode == &textNode && runStart && rendererText[runStart - 1] == ' ') {
693 unsigned spaceRunStart = runStart - 1;
694 while (spaceRunStart && rendererText[spaceRunStart - 1] == ' ')
696 emitText(textNode, renderer, spaceRunStart, spaceRunStart + 1);
698 emitCharacter(' ', textNode, nullptr, runStart, runStart);
701 unsigned textBoxEnd = textBoxStart + m_textBox->len();
702 unsigned runEnd = std::min(textBoxEnd, end);
704 // Determine what the next text box will be, but don't advance yet
705 InlineTextBox* nextTextBox = nullptr;
706 if (renderer.containsReversedText()) {
707 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
708 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
710 nextTextBox = m_textBox->nextTextBox();
711 ASSERT(!nextTextBox || &nextTextBox->renderer() == &renderer);
713 if (runStart < runEnd) {
714 // Handle either a single newline character (which becomes a space),
715 // or a run of characters that does not include a newline.
716 // This effectively translates newlines to spaces without copying the text.
717 if (rendererText[runStart] == '\n') {
718 emitCharacter(' ', textNode, nullptr, runStart, runStart + 1);
719 m_offset = runStart + 1;
721 size_t subrunEnd = rendererText.find('\n', runStart);
722 if (subrunEnd == notFound || subrunEnd > runEnd) {
724 bool lastSpaceCollapsedByNextNonTextBox = !nextTextBox && (m_behavior & TextIteratorBehavesAsIfNodesFollowing) && rendererText.length() > runEnd;
725 if (lastSpaceCollapsedByNextNonTextBox)
726 ++subrunEnd; // runEnd stopped before last space. Increment by one to restore the space.
728 m_offset = subrunEnd;
729 emitText(textNode, renderer, runStart, subrunEnd);
732 // If we are doing a subrun that doesn't go to the end of the text box,
733 // come back again to finish handling this text box; don't advance to the next one.
734 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
737 // Advance and return
738 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : rendererText.length();
739 if (nextRunStart > runEnd)
740 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
741 m_textBox = nextTextBox;
742 if (renderer.containsReversedText())
743 ++m_sortedTextBoxesPosition;
746 // Advance and continue
747 m_textBox = nextTextBox;
748 if (renderer.containsReversedText())
749 ++m_sortedTextBoxesPosition;
751 if (!m_textBox && m_remainingTextBox) {
752 m_textBox = m_remainingTextBox;
753 m_remainingTextBox = nullptr;
754 m_firstLetterText = nullptr;
760 static inline RenderText* firstRenderTextInFirstLetter(RenderBoxModelObject* firstLetter)
765 // FIXME: Should this check descendent objects?
766 return childrenOfType<RenderText>(*firstLetter).first();
769 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment& renderer)
771 if (auto* firstLetter = renderer.firstLetter()) {
772 if (firstLetter->style().visibility() != VISIBLE && !(m_behavior & TextIteratorIgnoresStyleVisibility))
774 if (auto* firstLetterText = firstRenderTextInFirstLetter(firstLetter)) {
775 m_handledFirstLetter = true;
776 m_remainingTextBox = m_textBox;
777 m_textBox = firstLetterText->firstTextBox();
778 m_sortedTextBoxes.clear();
779 m_firstLetterText = firstLetterText;
782 m_handledFirstLetter = true;
785 bool TextIterator::handleReplacedElement()
787 if (m_fullyClippedStack.top())
790 auto& renderer = *m_node->renderer();
791 if (renderer.style().visibility() != VISIBLE && !(m_behavior & TextIteratorIgnoresStyleVisibility))
794 if (m_lastTextNodeEndedWithCollapsedSpace) {
795 emitCharacter(' ', *m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
799 if ((m_behavior & TextIteratorEntersTextControls) && is<RenderTextControl>(renderer)) {
800 if (TextControlInnerTextElement* innerTextElement = downcast<RenderTextControl>(renderer).textFormControlElement().innerTextElement()) {
801 m_node = innerTextElement->containingShadowRoot();
802 pushFullyClippedState(m_fullyClippedStack, *m_node);
810 if ((m_behavior & TextIteratorEmitsObjectReplacementCharacters) && renderer.isReplaced()) {
811 emitCharacter(objectReplacementCharacter, *m_node->parentNode(), m_node, 0, 1);
812 // Don't process subtrees for embedded objects. If the text there is required,
813 // it must be explicitly asked by specifying a range falling inside its boundaries.
814 m_handledChildren = true;
818 if (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) {
819 // We want replaced elements to behave like punctuation for boundary
820 // finding, and to simply take up space for the selection preservation
821 // code in moveParagraphs, so we use a comma.
822 emitCharacter(',', *m_node->parentNode(), m_node, 0, 1);
826 m_positionNode = m_node->parentNode();
827 m_positionOffsetBaseNode = m_node;
828 m_positionStartOffset = 0;
829 m_positionEndOffset = 1;
831 if ((m_behavior & TextIteratorEmitsImageAltText) && is<RenderImage>(renderer)) {
832 String altText = downcast<RenderImage>(renderer).altText();
833 if (unsigned length = altText.length()) {
834 m_lastCharacter = altText[length - 1];
835 m_copyableText.set(WTFMove(altText));
836 m_text = m_copyableText.text();
841 m_copyableText.reset();
842 m_text = StringView();
847 static bool shouldEmitTabBeforeNode(Node& node)
849 auto* renderer = node.renderer();
851 // Table cells are delimited by tabs.
852 if (!renderer || !isTableCell(&node))
855 // Want a tab before every cell other than the first one.
856 RenderTableCell& cell = downcast<RenderTableCell>(*renderer);
857 RenderTable* table = cell.table();
858 return table && (table->cellBefore(&cell) || table->cellAbove(&cell));
861 static bool shouldEmitNewlineForNode(Node* node, bool emitsOriginalText)
863 auto* renderer = node->renderer();
864 if (!(renderer ? renderer->isBR() : node->hasTagName(brTag)))
866 return emitsOriginalText || !(node->isInShadowTree() && is<HTMLInputElement>(*node->shadowHost()));
869 static bool hasHeaderTag(HTMLElement& element)
871 return element.hasTagName(h1Tag)
872 || element.hasTagName(h2Tag)
873 || element.hasTagName(h3Tag)
874 || element.hasTagName(h4Tag)
875 || element.hasTagName(h5Tag)
876 || element.hasTagName(h6Tag);
879 static bool shouldEmitNewlinesBeforeAndAfterNode(Node& node)
881 // Block flow (versus inline flow) is represented by having
882 // a newline both before and after the element.
883 auto* renderer = node.renderer();
885 if (!is<HTMLElement>(node))
887 auto& element = downcast<HTMLElement>(node);
888 return hasHeaderTag(element)
889 || element.hasTagName(blockquoteTag)
890 || element.hasTagName(ddTag)
891 || element.hasTagName(divTag)
892 || element.hasTagName(dlTag)
893 || element.hasTagName(dtTag)
894 || element.hasTagName(hrTag)
895 || element.hasTagName(liTag)
896 || element.hasTagName(listingTag)
897 || element.hasTagName(olTag)
898 || element.hasTagName(pTag)
899 || element.hasTagName(preTag)
900 || element.hasTagName(trTag)
901 || element.hasTagName(ulTag);
904 // Need to make an exception for table cells, because they are blocks, but we
905 // want them tab-delimited rather than having newlines before and after.
906 if (isTableCell(&node))
909 // Need to make an exception for table row elements, because they are neither
910 // "inline" or "RenderBlock", but we want newlines for them.
911 if (is<RenderTableRow>(*renderer)) {
912 RenderTable* table = downcast<RenderTableRow>(*renderer).table();
913 if (table && !table->isInline())
917 return !renderer->isInline()
918 && is<RenderBlock>(*renderer)
919 && !renderer->isFloatingOrOutOfFlowPositioned()
920 && !renderer->isBody()
921 && !renderer->isRubyText();
924 static bool shouldEmitNewlineAfterNode(Node& node)
926 // FIXME: It should be better but slower to create a VisiblePosition here.
927 if (!shouldEmitNewlinesBeforeAndAfterNode(node))
929 // Check if this is the very last renderer in the document.
930 // If so, then we should not emit a newline.
931 Node* subsequentNode = &node;
932 while ((subsequentNode = NodeTraversal::nextSkippingChildren(*subsequentNode))) {
933 if (subsequentNode->renderer())
939 static bool shouldEmitNewlineBeforeNode(Node& node)
941 return shouldEmitNewlinesBeforeAndAfterNode(node);
944 static bool shouldEmitExtraNewlineForNode(Node& node)
946 // When there is a significant collapsed bottom margin, emit an extra
947 // newline for a more realistic result. We end up getting the right
948 // result even without margin collapsing. For example: <div><p>text</p></div>
949 // will work right even if both the <div> and the <p> have bottom margins.
951 auto* renderer = node.renderer();
952 if (!is<RenderBox>(renderer))
955 // NOTE: We only do this for a select set of nodes, and WinIE appears not to do this at all.
956 if (!is<HTMLElement>(node))
959 HTMLElement& element = downcast<HTMLElement>(node);
960 if (!hasHeaderTag(element) && !is<HTMLParagraphElement>(element))
963 int bottomMargin = downcast<RenderBox>(*renderer).collapsedMarginAfter();
964 int fontSize = downcast<RenderBox>(*renderer).style().fontDescription().computedPixelSize();
965 return bottomMargin * 2 >= fontSize;
968 static int collapsedSpaceLength(RenderText& renderer, int textEnd)
970 StringImpl& text = *renderer.text();
971 unsigned length = text.length();
972 for (unsigned i = textEnd; i < length; ++i) {
973 if (!renderer.style().isCollapsibleWhiteSpace(text[i]))
976 return length - textEnd;
979 static int maxOffsetIncludingCollapsedSpaces(Node& node)
981 int offset = caretMaxOffset(node);
982 if (auto* renderer = node.renderer()) {
983 if (is<RenderText>(*renderer))
984 offset += collapsedSpaceLength(downcast<RenderText>(*renderer), offset);
989 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
990 bool TextIterator::shouldRepresentNodeOffsetZero()
992 if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isTable())
995 // Leave element positioned flush with start of a paragraph
996 // (e.g. do not insert tab before a table cell at the start of a paragraph)
997 if (m_lastCharacter == '\n')
1000 // Otherwise, show the position if we have emitted any characters
1004 // We've not emitted anything yet. Generally, there is no need for any positioning then.
1005 // The only exception is when the element is visually not in the same line as
1006 // the start of the range (e.g. the range starts at the end of the previous paragraph).
1007 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
1008 // make quicker checks to possibly avoid that. Another check that we could make is
1009 // is whether the inline vs block flow changed since the previous visible element.
1010 // I think we're already in a special enough case that that won't be needed, tho.
1012 // No character needed if this is the first node in the range.
1013 if (m_node == m_startContainer)
1016 // If we are outside the start container's subtree, assume we need to emit.
1017 // FIXME: m_startContainer could be an inline block
1018 if (!m_node->isDescendantOf(m_startContainer))
1021 // If we started as m_startContainer offset 0 and the current node is a descendant of
1022 // the start container, we already had enough context to correctly decide whether to
1023 // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
1024 // so don't second guess that now.
1025 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
1026 // immaterial since we likely would have already emitted something by now.
1027 if (m_startOffset == 0)
1030 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
1031 // Additionally, if the range we are iterating over contains huge sections of unrendered content,
1032 // we would create VisiblePositions on every call to this function without this check.
1033 if (!m_node->renderer() || m_node->renderer()->style().visibility() != VISIBLE
1034 || (is<RenderBlockFlow>(*m_node->renderer()) && !downcast<RenderBlockFlow>(*m_node->renderer()).height() && !is<HTMLBodyElement>(*m_node)))
1037 // The startPos.isNotNull() check is needed because the start could be before the body,
1038 // and in that case we'll get null. We don't want to put in newlines at the start in that case.
1039 // The currPos.isNotNull() check is needed because positions in non-HTML content
1040 // (like SVG) do not have visible positions, and we don't want to emit for them either.
1041 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
1042 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
1043 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
1046 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node& node)
1048 return node.renderer() && node.renderer()->isTable() && (node.renderer()->isInline() || (m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions));
1051 void TextIterator::representNodeOffsetZero()
1053 // Emit a character to show the positioning of m_node.
1055 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
1056 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks
1057 // on m_node to see if it necessitates emitting a character first and will early return
1058 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
1059 if (shouldEmitTabBeforeNode(*m_node)) {
1060 if (shouldRepresentNodeOffsetZero())
1061 emitCharacter('\t', *m_node->parentNode(), m_node, 0, 0);
1062 } else if (shouldEmitNewlineBeforeNode(*m_node)) {
1063 if (shouldRepresentNodeOffsetZero())
1064 emitCharacter('\n', *m_node->parentNode(), m_node, 0, 0);
1065 } else if (shouldEmitSpaceBeforeAndAfterNode(*m_node)) {
1066 if (shouldRepresentNodeOffsetZero())
1067 emitCharacter(' ', *m_node->parentNode(), m_node, 0, 0);
1071 bool TextIterator::handleNonTextNode()
1073 if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText))
1074 emitCharacter('\n', *m_node->parentNode(), m_node, 0, 1);
1075 else if ((m_behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions) && m_node->renderer() && m_node->renderer()->isHR())
1076 emitCharacter(' ', *m_node->parentNode(), m_node, 0, 1);
1078 representNodeOffsetZero();
1083 void TextIterator::exitNode(Node* exitedNode)
1085 // prevent emitting a newline when exiting a collapsed block at beginning of the range
1086 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
1087 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
1088 // therefore look like a blank line.
1092 // Emit with a position *inside* m_node, after m_node's contents, in
1093 // case it is a block, because the run should start where the
1094 // emitted character is positioned visually.
1095 Node* baseNode = exitedNode;
1096 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
1097 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
1098 // TextIterator in _web_attributedStringFromRange.
1099 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
1100 if (m_lastTextNode && shouldEmitNewlineAfterNode(*m_node)) {
1101 // use extra newline to represent margin bottom, as needed
1102 bool addNewline = shouldEmitExtraNewlineForNode(*m_node);
1104 // FIXME: We need to emit a '\n' as we leave an empty block(s) that
1105 // contain a VisiblePosition when doing selection preservation.
1106 if (m_lastCharacter != '\n') {
1107 // insert a newline with a position following this block's contents.
1108 emitCharacter('\n', *baseNode->parentNode(), baseNode, 1, 1);
1109 // remember whether to later add a newline for the current node
1110 ASSERT(!m_nodeForAdditionalNewline);
1112 m_nodeForAdditionalNewline = baseNode;
1113 } else if (addNewline)
1114 // insert a newline with a position following this block's contents.
1115 emitCharacter('\n', *baseNode->parentNode(), baseNode, 1, 1);
1118 // If nothing was emitted, see if we need to emit a space.
1119 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(*m_node))
1120 emitCharacter(' ', *baseNode->parentNode(), baseNode, 1, 1);
1123 void TextIterator::emitCharacter(UChar character, Node& characterNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
1125 m_hasEmitted = true;
1127 // remember information with which to construct the TextIterator::range()
1128 m_positionNode = &characterNode;
1129 m_positionOffsetBaseNode = offsetBaseNode;
1130 m_positionStartOffset = textStartOffset;
1131 m_positionEndOffset = textEndOffset;
1133 m_copyableText.set(character);
1134 m_text = m_copyableText.text();
1135 m_lastCharacter = character;
1136 m_lastTextNodeEndedWithCollapsedSpace = false;
1137 m_nextRunNeedsWhitespace = false;
1140 void TextIterator::emitText(Text& textNode, RenderText& renderer, int textStartOffset, int textEndOffset)
1142 ASSERT(textStartOffset >= 0);
1143 ASSERT(textEndOffset >= 0);
1144 ASSERT(textStartOffset <= textEndOffset);
1146 // FIXME: This probably yields the wrong offsets when text-transform: lowercase turns a single character into two characters.
1147 String string = (m_behavior & TextIteratorEmitsOriginalText) ? renderer.originalText()
1148 : ((m_behavior & TextIteratorEmitsTextsWithoutTranscoding) ? renderer.textWithoutConvertingBackslashToYenSymbol() : renderer.text());
1150 ASSERT(string.length() >= static_cast<unsigned>(textEndOffset));
1152 m_positionNode = &textNode;
1153 m_positionOffsetBaseNode = nullptr;
1154 m_positionStartOffset = textStartOffset;
1155 m_positionEndOffset = textEndOffset;
1157 m_lastCharacter = string[textEndOffset - 1];
1158 m_copyableText.set(WTFMove(string), textStartOffset, textEndOffset - textStartOffset);
1159 m_text = m_copyableText.text();
1161 m_lastTextNodeEndedWithCollapsedSpace = false;
1162 m_nextRunNeedsWhitespace = false;
1163 m_hasEmitted = true;
1166 Ref<Range> TextIterator::range() const
1170 // use the current run information, if we have it
1171 if (m_positionOffsetBaseNode) {
1172 unsigned index = m_positionOffsetBaseNode->computeNodeIndex();
1173 m_positionStartOffset += index;
1174 m_positionEndOffset += index;
1175 m_positionOffsetBaseNode = nullptr;
1177 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1180 Node* TextIterator::node() const
1182 Ref<Range> textRange = range();
1184 Node& node = textRange->startContainer();
1185 if (node.offsetInCharacters())
1188 return node.traverseToChildAt(textRange->startOffset());
1193 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range& range)
1195 range.ownerDocument().updateLayoutIgnorePendingStylesheets();
1197 Node* startNode = &range.startContainer();
1198 Node* endNode = &range.endContainer();
1199 int startOffset = range.startOffset();
1200 int endOffset = range.endOffset();
1202 if (!startNode->offsetInCharacters()) {
1203 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->countChildNodes())) {
1204 startNode = startNode->traverseToChildAt(startOffset);
1208 if (!endNode->offsetInCharacters()) {
1209 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->countChildNodes())) {
1210 endNode = endNode->traverseToChildAt(endOffset - 1);
1211 endOffset = lastOffsetInNode(endNode);
1216 setUpFullyClippedStack(m_fullyClippedStack, *m_node);
1217 m_offset = endOffset;
1218 m_handledNode = false;
1219 m_handledChildren = endOffset == 0;
1221 m_startContainer = startNode;
1222 m_startOffset = startOffset;
1223 m_endContainer = endNode;
1224 m_endOffset = endOffset;
1226 m_positionNode = endNode;
1228 m_lastTextNode = nullptr;
1229 m_lastCharacter = '\n';
1231 m_havePassedStartContainer = false;
1236 void SimplifiedBackwardsTextIterator::advance()
1240 m_positionNode = nullptr;
1241 m_copyableText.reset();
1242 m_text = StringView();
1244 while (m_node && !m_havePassedStartContainer) {
1245 // Don't handle node if we start iterating at [node, 0].
1246 if (!m_handledNode && !(m_node == m_endContainer && !m_endOffset)) {
1247 auto* renderer = m_node->renderer();
1248 if (renderer && renderer->isText() && m_node->isTextNode()) {
1249 if (renderer->style().visibility() == VISIBLE && m_offset > 0)
1250 m_handledNode = handleTextNode();
1251 } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1252 if (renderer->style().visibility() == VISIBLE && m_offset > 0)
1253 m_handledNode = handleReplacedElement();
1255 m_handledNode = handleNonTextNode();
1260 if (!m_handledChildren && m_node->hasChildNodes()) {
1261 m_node = m_node->lastChild();
1262 pushFullyClippedState(m_fullyClippedStack, *m_node);
1264 // Exit empty containers as we pass over them or containers
1265 // where [container, 0] is where we started iterating.
1266 if (!m_handledNode && canHaveChildrenForEditing(*m_node) && m_node->parentNode() && (!m_node->lastChild() || (m_node == m_endContainer && !m_endOffset))) {
1268 if (m_positionNode) {
1269 m_handledNode = true;
1270 m_handledChildren = true;
1275 // Exit all other containers.
1276 while (!m_node->previousSibling()) {
1277 if (!advanceRespectingRange(m_node->parentOrShadowHostNode()))
1279 m_fullyClippedStack.pop();
1281 if (m_positionNode) {
1282 m_handledNode = true;
1283 m_handledChildren = true;
1288 m_fullyClippedStack.pop();
1289 if (advanceRespectingRange(m_node->previousSibling()))
1290 pushFullyClippedState(m_fullyClippedStack, *m_node);
1295 // For the purpose of word boundary detection,
1296 // we should iterate all visible text and trailing (collapsed) whitespaces.
1297 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(*m_node) : 0;
1298 m_handledNode = false;
1299 m_handledChildren = false;
1306 bool SimplifiedBackwardsTextIterator::handleTextNode()
1308 Text& textNode = downcast<Text>(*m_node);
1310 m_lastTextNode = &textNode;
1314 RenderText* renderer = handleFirstLetter(startOffset, offsetInNode);
1318 String text = renderer->text();
1319 if (!renderer->hasRenderedText() && text.length())
1322 if (startOffset + offsetInNode == m_offset) {
1323 ASSERT(!m_shouldHandleFirstLetter);
1327 m_positionEndOffset = m_offset;
1328 m_offset = startOffset + offsetInNode;
1329 m_positionNode = m_node;
1330 m_positionStartOffset = m_offset;
1332 ASSERT(m_positionStartOffset < m_positionEndOffset);
1333 ASSERT(m_positionStartOffset - offsetInNode >= 0);
1334 ASSERT(m_positionEndOffset - offsetInNode > 0);
1335 ASSERT(m_positionEndOffset - offsetInNode <= static_cast<int>(text.length()));
1337 m_lastCharacter = text[m_positionEndOffset - offsetInNode - 1];
1338 m_copyableText.set(WTFMove(text), m_positionStartOffset - offsetInNode, m_positionEndOffset - m_positionStartOffset);
1339 m_text = m_copyableText.text();
1341 return !m_shouldHandleFirstLetter;
1344 RenderText* SimplifiedBackwardsTextIterator::handleFirstLetter(int& startOffset, int& offsetInNode)
1346 RenderText& renderer = downcast<RenderText>(*m_node->renderer());
1347 startOffset = (m_node == m_startContainer) ? m_startOffset : 0;
1349 if (!is<RenderTextFragment>(renderer)) {
1354 RenderTextFragment& fragment = downcast<RenderTextFragment>(renderer);
1355 int offsetAfterFirstLetter = fragment.start();
1356 if (startOffset >= offsetAfterFirstLetter) {
1357 ASSERT(!m_shouldHandleFirstLetter);
1358 offsetInNode = offsetAfterFirstLetter;
1362 if (!m_shouldHandleFirstLetter && startOffset + offsetAfterFirstLetter < m_offset) {
1363 m_shouldHandleFirstLetter = true;
1364 offsetInNode = offsetAfterFirstLetter;
1368 m_shouldHandleFirstLetter = false;
1370 RenderText* firstLetterRenderer = firstRenderTextInFirstLetter(fragment.firstLetter());
1372 m_offset = firstLetterRenderer->caretMaxOffset();
1373 m_offset += collapsedSpaceLength(*firstLetterRenderer, m_offset);
1375 return firstLetterRenderer;
1378 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1380 unsigned index = m_node->computeNodeIndex();
1381 // We want replaced elements to behave like punctuation for boundary
1382 // finding, and to simply take up space for the selection preservation
1383 // code in moveParagraphs, so we use a comma. Unconditionally emit
1384 // here because this iterator is only used for boundary finding.
1385 emitCharacter(',', *m_node->parentNode(), index, index + 1);
1389 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1391 // We can use a linefeed in place of a tab because this simple iterator is only used to
1392 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
1393 if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) || shouldEmitNewlineAfterNode(*m_node) || shouldEmitTabBeforeNode(*m_node)) {
1394 unsigned index = m_node->computeNodeIndex();
1395 // The start of this emitted range is wrong. Ensuring correctness would require
1396 // VisiblePositions and so would be slow. previousBoundary expects this.
1397 emitCharacter('\n', *m_node->parentNode(), index + 1, index + 1);
1402 void SimplifiedBackwardsTextIterator::exitNode()
1404 if (shouldEmitNewlineForNode(m_node, m_behavior & TextIteratorEmitsOriginalText) || shouldEmitNewlineBeforeNode(*m_node) || shouldEmitTabBeforeNode(*m_node)) {
1405 // The start of this emitted range is wrong. Ensuring correctness would require
1406 // VisiblePositions and so would be slow. previousBoundary expects this.
1407 emitCharacter('\n', *m_node, 0, 0);
1411 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node& node, int startOffset, int endOffset)
1413 m_positionNode = &node;
1414 m_positionStartOffset = startOffset;
1415 m_positionEndOffset = endOffset;
1416 m_copyableText.set(c);
1417 m_text = m_copyableText.text();
1418 m_lastCharacter = c;
1421 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1425 m_havePassedStartContainer |= m_node == m_startContainer;
1426 if (m_havePassedStartContainer)
1432 Ref<Range> SimplifiedBackwardsTextIterator::range() const
1436 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1441 CharacterIterator::CharacterIterator(const Range& range, TextIteratorBehavior behavior)
1442 : m_underlyingIterator(&range, behavior)
1447 while (!atEnd() && !m_underlyingIterator.text().length())
1448 m_underlyingIterator.advance();
1451 Ref<Range> CharacterIterator::range() const
1453 Ref<Range> range = m_underlyingIterator.range();
1454 if (!m_underlyingIterator.atEnd()) {
1455 if (m_underlyingIterator.text().length() <= 1) {
1456 ASSERT(m_runOffset == 0);
1458 Node& node = range->startContainer();
1459 ASSERT(&node == &range->endContainer());
1460 int offset = range->startOffset() + m_runOffset;
1461 range->setStart(node, offset);
1462 range->setEnd(node, offset + 1);
1468 void CharacterIterator::advance(int count)
1477 // easy if there is enough left in the current m_underlyingIterator run
1478 int remaining = m_underlyingIterator.text().length() - m_runOffset;
1479 if (count < remaining) {
1480 m_runOffset += count;
1485 // exhaust the current m_underlyingIterator run
1487 m_offset += remaining;
1489 // move to a subsequent m_underlyingIterator run
1490 for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) {
1491 int runLength = m_underlyingIterator.text().length();
1495 // see whether this is m_underlyingIterator to use
1496 if (count < runLength) {
1497 m_runOffset = count;
1502 // exhaust this m_underlyingIterator run
1504 m_offset += runLength;
1508 // ran to the end of the m_underlyingIterator... no more runs left
1513 static Ref<Range> characterSubrange(Document& document, CharacterIterator& it, int offset, int length)
1517 return Range::create(document);
1519 Ref<Range> start = it.range();
1522 it.advance(length - 1);
1524 return Range::create(document);
1526 Ref<Range> end = it.range();
1528 return Range::create(document, &start->startContainer(), start->startOffset(), &end->endContainer(), end->endOffset());
1531 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range& range)
1532 : m_underlyingIterator(range)
1537 while (!atEnd() && !m_underlyingIterator.text().length())
1538 m_underlyingIterator.advance();
1541 Ref<Range> BackwardsCharacterIterator::range() const
1543 Ref<Range> r = m_underlyingIterator.range();
1544 if (!m_underlyingIterator.atEnd()) {
1545 if (m_underlyingIterator.text().length() <= 1)
1546 ASSERT(m_runOffset == 0);
1548 Node& node = r->startContainer();
1549 ASSERT(&node == &r->endContainer());
1550 int offset = r->endOffset() - m_runOffset;
1551 r->setStart(node, offset - 1);
1552 r->setEnd(node, offset);
1558 void BackwardsCharacterIterator::advance(int count)
1567 int remaining = m_underlyingIterator.text().length() - m_runOffset;
1568 if (count < remaining) {
1569 m_runOffset += count;
1575 m_offset += remaining;
1577 for (m_underlyingIterator.advance(); !atEnd(); m_underlyingIterator.advance()) {
1578 int runLength = m_underlyingIterator.text().length();
1582 if (count < runLength) {
1583 m_runOffset = count;
1589 m_offset += runLength;
1599 WordAwareIterator::WordAwareIterator(const Range& range)
1600 : m_underlyingIterator(&range)
1601 , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1603 advance(); // get in position over the first chunk of text
1606 // We're always in one of these modes:
1607 // - The current chunk in the text iterator is our current chunk
1608 // (typically its a piece of whitespace, or text that ended with whitespace)
1609 // - The previous chunk in the text iterator is our current chunk
1610 // (we looked ahead to the next chunk and found a word boundary)
1611 // - We built up our own chunk of text from many chunks from the text iterator
1613 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1615 void WordAwareIterator::advance()
1617 m_previousText.reset();
1620 // If last time we did a look-ahead, start with that looked-ahead chunk now
1621 if (!m_didLookAhead) {
1622 ASSERT(!m_underlyingIterator.atEnd());
1623 m_underlyingIterator.advance();
1625 m_didLookAhead = false;
1627 // Go to next non-empty chunk
1628 while (!m_underlyingIterator.atEnd() && !m_underlyingIterator.text().length())
1629 m_underlyingIterator.advance();
1630 if (m_underlyingIterator.atEnd())
1634 // If this chunk ends in whitespace we can just use it as our chunk.
1635 if (isSpaceOrNewline(m_underlyingIterator.text()[m_underlyingIterator.text().length() - 1]))
1638 // If this is the first chunk that failed, save it in previousText before look ahead
1639 if (m_buffer.isEmpty())
1640 m_previousText = m_underlyingIterator.copyableText();
1642 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1643 m_underlyingIterator.advance();
1644 if (m_underlyingIterator.atEnd() || !m_underlyingIterator.text().length() || isSpaceOrNewline(m_underlyingIterator.text()[0])) {
1645 m_didLookAhead = true;
1649 if (m_buffer.isEmpty()) {
1650 // Start gobbling chunks until we get to a suitable stopping point
1651 append(m_buffer, m_previousText.text());
1652 m_previousText.reset();
1654 append(m_buffer, m_underlyingIterator.text());
1658 StringView WordAwareIterator::text() const
1660 if (!m_buffer.isEmpty())
1661 return StringView(m_buffer.data(), m_buffer.size());
1662 if (m_previousText.text().length())
1663 return m_previousText.text();
1664 return m_underlyingIterator.text();
1669 static inline UChar foldQuoteMark(UChar c)
1672 case hebrewPunctuationGershayim:
1673 case leftDoubleQuotationMark:
1674 case rightDoubleQuotationMark:
1676 case hebrewPunctuationGeresh:
1677 case leftSingleQuotationMark:
1678 case rightSingleQuotationMark:
1685 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1686 // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1687 // to add tailoring on top of the locale-specific tailoring as of this writing.
1688 static inline String foldQuoteMarks(String string)
1690 string.replace(hebrewPunctuationGeresh, '\'');
1691 string.replace(hebrewPunctuationGershayim, '"');
1692 string.replace(leftDoubleQuotationMark, '"');
1693 string.replace(leftSingleQuotationMark, '\'');
1694 string.replace(rightDoubleQuotationMark, '"');
1695 string.replace(rightSingleQuotationMark, '\'');
1700 #if !UCONFIG_NO_COLLATION
1702 const size_t minimumSearchBufferSize = 8192;
1705 static bool searcherInUse;
1708 static UStringSearch* createSearcher()
1710 // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1711 // but it doesn't matter exactly what it is, since we don't perform any searches
1712 // without setting both the pattern and the text.
1713 UErrorCode status = U_ZERO_ERROR;
1714 String searchCollatorName = makeString(currentSearchLocaleID(), "@collation=search");
1715 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, searchCollatorName.utf8().data(), 0, &status);
1716 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1720 static UStringSearch* searcher()
1722 static UStringSearch* searcher = createSearcher();
1726 static inline void lockSearcher()
1729 ASSERT(!searcherInUse);
1730 searcherInUse = true;
1734 static inline void unlockSearcher()
1737 ASSERT(searcherInUse);
1738 searcherInUse = false;
1742 // ICU's search ignores the distinction between small kana letters and ones
1743 // that are not small, and also characters that differ only in the voicing
1744 // marks when considering only primary collation strength differences.
1745 // This is not helpful for end users, since these differences make words
1746 // distinct, so for our purposes we need these to be considered.
1747 // The Unicode folks do not think the collation algorithm should be
1748 // changed. To work around this, we would like to tailor the ICU searcher,
1749 // but we can't get that to work yet. So instead, we check for cases where
1750 // these differences occur, and skip those matches.
1752 // We refer to the above technique as the "kana workaround". The next few
1753 // functions are helper functinos for the kana workaround.
1755 static inline bool isKanaLetter(UChar character)
1757 // Hiragana letters.
1758 if (character >= 0x3041 && character <= 0x3096)
1761 // Katakana letters.
1762 if (character >= 0x30A1 && character <= 0x30FA)
1764 if (character >= 0x31F0 && character <= 0x31FF)
1767 // Halfwidth katakana letters.
1768 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1774 static inline bool isSmallKanaLetter(UChar character)
1776 ASSERT(isKanaLetter(character));
1778 switch (character) {
1779 case 0x3041: // HIRAGANA LETTER SMALL A
1780 case 0x3043: // HIRAGANA LETTER SMALL I
1781 case 0x3045: // HIRAGANA LETTER SMALL U
1782 case 0x3047: // HIRAGANA LETTER SMALL E
1783 case 0x3049: // HIRAGANA LETTER SMALL O
1784 case 0x3063: // HIRAGANA LETTER SMALL TU
1785 case 0x3083: // HIRAGANA LETTER SMALL YA
1786 case 0x3085: // HIRAGANA LETTER SMALL YU
1787 case 0x3087: // HIRAGANA LETTER SMALL YO
1788 case 0x308E: // HIRAGANA LETTER SMALL WA
1789 case 0x3095: // HIRAGANA LETTER SMALL KA
1790 case 0x3096: // HIRAGANA LETTER SMALL KE
1791 case 0x30A1: // KATAKANA LETTER SMALL A
1792 case 0x30A3: // KATAKANA LETTER SMALL I
1793 case 0x30A5: // KATAKANA LETTER SMALL U
1794 case 0x30A7: // KATAKANA LETTER SMALL E
1795 case 0x30A9: // KATAKANA LETTER SMALL O
1796 case 0x30C3: // KATAKANA LETTER SMALL TU
1797 case 0x30E3: // KATAKANA LETTER SMALL YA
1798 case 0x30E5: // KATAKANA LETTER SMALL YU
1799 case 0x30E7: // KATAKANA LETTER SMALL YO
1800 case 0x30EE: // KATAKANA LETTER SMALL WA
1801 case 0x30F5: // KATAKANA LETTER SMALL KA
1802 case 0x30F6: // KATAKANA LETTER SMALL KE
1803 case 0x31F0: // KATAKANA LETTER SMALL KU
1804 case 0x31F1: // KATAKANA LETTER SMALL SI
1805 case 0x31F2: // KATAKANA LETTER SMALL SU
1806 case 0x31F3: // KATAKANA LETTER SMALL TO
1807 case 0x31F4: // KATAKANA LETTER SMALL NU
1808 case 0x31F5: // KATAKANA LETTER SMALL HA
1809 case 0x31F6: // KATAKANA LETTER SMALL HI
1810 case 0x31F7: // KATAKANA LETTER SMALL HU
1811 case 0x31F8: // KATAKANA LETTER SMALL HE
1812 case 0x31F9: // KATAKANA LETTER SMALL HO
1813 case 0x31FA: // KATAKANA LETTER SMALL MU
1814 case 0x31FB: // KATAKANA LETTER SMALL RA
1815 case 0x31FC: // KATAKANA LETTER SMALL RI
1816 case 0x31FD: // KATAKANA LETTER SMALL RU
1817 case 0x31FE: // KATAKANA LETTER SMALL RE
1818 case 0x31FF: // KATAKANA LETTER SMALL RO
1819 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1820 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1821 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1822 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1823 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1824 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1825 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1826 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1827 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1833 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1835 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1837 ASSERT(isKanaLetter(character));
1839 switch (character) {
1840 case 0x304C: // HIRAGANA LETTER GA
1841 case 0x304E: // HIRAGANA LETTER GI
1842 case 0x3050: // HIRAGANA LETTER GU
1843 case 0x3052: // HIRAGANA LETTER GE
1844 case 0x3054: // HIRAGANA LETTER GO
1845 case 0x3056: // HIRAGANA LETTER ZA
1846 case 0x3058: // HIRAGANA LETTER ZI
1847 case 0x305A: // HIRAGANA LETTER ZU
1848 case 0x305C: // HIRAGANA LETTER ZE
1849 case 0x305E: // HIRAGANA LETTER ZO
1850 case 0x3060: // HIRAGANA LETTER DA
1851 case 0x3062: // HIRAGANA LETTER DI
1852 case 0x3065: // HIRAGANA LETTER DU
1853 case 0x3067: // HIRAGANA LETTER DE
1854 case 0x3069: // HIRAGANA LETTER DO
1855 case 0x3070: // HIRAGANA LETTER BA
1856 case 0x3073: // HIRAGANA LETTER BI
1857 case 0x3076: // HIRAGANA LETTER BU
1858 case 0x3079: // HIRAGANA LETTER BE
1859 case 0x307C: // HIRAGANA LETTER BO
1860 case 0x3094: // HIRAGANA LETTER VU
1861 case 0x30AC: // KATAKANA LETTER GA
1862 case 0x30AE: // KATAKANA LETTER GI
1863 case 0x30B0: // KATAKANA LETTER GU
1864 case 0x30B2: // KATAKANA LETTER GE
1865 case 0x30B4: // KATAKANA LETTER GO
1866 case 0x30B6: // KATAKANA LETTER ZA
1867 case 0x30B8: // KATAKANA LETTER ZI
1868 case 0x30BA: // KATAKANA LETTER ZU
1869 case 0x30BC: // KATAKANA LETTER ZE
1870 case 0x30BE: // KATAKANA LETTER ZO
1871 case 0x30C0: // KATAKANA LETTER DA
1872 case 0x30C2: // KATAKANA LETTER DI
1873 case 0x30C5: // KATAKANA LETTER DU
1874 case 0x30C7: // KATAKANA LETTER DE
1875 case 0x30C9: // KATAKANA LETTER DO
1876 case 0x30D0: // KATAKANA LETTER BA
1877 case 0x30D3: // KATAKANA LETTER BI
1878 case 0x30D6: // KATAKANA LETTER BU
1879 case 0x30D9: // KATAKANA LETTER BE
1880 case 0x30DC: // KATAKANA LETTER BO
1881 case 0x30F4: // KATAKANA LETTER VU
1882 case 0x30F7: // KATAKANA LETTER VA
1883 case 0x30F8: // KATAKANA LETTER VI
1884 case 0x30F9: // KATAKANA LETTER VE
1885 case 0x30FA: // KATAKANA LETTER VO
1886 return VoicedSoundMark;
1887 case 0x3071: // HIRAGANA LETTER PA
1888 case 0x3074: // HIRAGANA LETTER PI
1889 case 0x3077: // HIRAGANA LETTER PU
1890 case 0x307A: // HIRAGANA LETTER PE
1891 case 0x307D: // HIRAGANA LETTER PO
1892 case 0x30D1: // KATAKANA LETTER PA
1893 case 0x30D4: // KATAKANA LETTER PI
1894 case 0x30D7: // KATAKANA LETTER PU
1895 case 0x30DA: // KATAKANA LETTER PE
1896 case 0x30DD: // KATAKANA LETTER PO
1897 return SemiVoicedSoundMark;
1899 return NoVoicedSoundMark;
1902 static inline bool isCombiningVoicedSoundMark(UChar character)
1904 switch (character) {
1905 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1906 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1912 static inline bool containsKanaLetters(const String& pattern)
1914 if (pattern.is8Bit())
1916 const UChar* characters = pattern.characters16();
1917 unsigned length = pattern.length();
1918 for (unsigned i = 0; i < length; ++i) {
1919 if (isKanaLetter(characters[i]))
1925 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1929 buffer.resize(length);
1931 UErrorCode status = U_ZERO_ERROR;
1932 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1933 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1936 buffer.resize(bufferSize);
1938 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1941 status = U_ZERO_ERROR;
1942 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1943 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1946 static bool isNonLatin1Separator(UChar32 character)
1948 ASSERT_ARG(character, character >= 256);
1950 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1953 static inline bool isSeparator(UChar32 character)
1955 static const bool latin1SeparatorTable[256] = {
1956 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1957 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1958 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1959 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ?
1960 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @
1961 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _
1962 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // `
1963 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~
1964 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1965 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1966 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1967 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1968 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1969 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1970 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1971 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1974 if (character < 256)
1975 return latin1SeparatorTable[character];
1977 return isNonLatin1Separator(character);
1980 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1981 : m_target(foldQuoteMarks(target))
1982 , m_targetCharacters(StringView(m_target).upconvertedCharacters())
1983 , m_options(options)
1986 , m_needsMoreContext(options & AtWordStarts)
1987 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1989 ASSERT(!m_target.isEmpty());
1991 size_t targetLength = m_target.length();
1992 m_buffer.reserveInitialCapacity(std::max(targetLength * 8, minimumSearchBufferSize));
1993 m_overlap = m_buffer.capacity() / 4;
1995 if ((m_options & AtWordStarts) && targetLength) {
1996 UChar32 targetFirstCharacter;
1997 U16_GET(m_target, 0, 0u, targetLength, targetFirstCharacter);
1998 // Characters in the separator category never really occur at the beginning of a word,
1999 // so if the target begins with such a character, we just ignore the AtWordStart option.
2000 if (isSeparator(targetFirstCharacter)) {
2001 m_options &= ~AtWordStarts;
2002 m_needsMoreContext = false;
2006 // Grab the single global searcher.
2007 // If we ever have a reason to do more than once search buffer at once, we'll have
2008 // to move to multiple searchers.
2011 UStringSearch* searcher = WebCore::searcher();
2012 UCollator* collator = usearch_getCollator(searcher);
2014 UCollationStrength strength;
2015 USearchAttributeValue comparator;
2016 if (m_options & CaseInsensitive) {
2017 // Without loss of generality, have 'e' match {'e', 'E', 'é', 'É'} and 'é' match {'é', 'É'}.
2018 strength = UCOL_SECONDARY;
2019 comparator = USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD;
2021 // Without loss of generality, have 'e' match {'e'} and 'é' match {'é'}.
2022 strength = UCOL_TERTIARY;
2023 comparator = USEARCH_STANDARD_ELEMENT_COMPARISON;
2025 if (ucol_getStrength(collator) != strength) {
2026 ucol_setStrength(collator, strength);
2027 usearch_reset(searcher);
2030 UErrorCode status = U_ZERO_ERROR;
2031 usearch_setAttribute(searcher, USEARCH_ELEMENT_COMPARISON, comparator, &status);
2032 ASSERT(status == U_ZERO_ERROR);
2034 usearch_setPattern(searcher, m_targetCharacters, targetLength, &status);
2035 ASSERT(status == U_ZERO_ERROR);
2037 // The kana workaround requires a normalized copy of the target string.
2038 if (m_targetRequiresKanaWorkaround)
2039 normalizeCharacters(m_targetCharacters, targetLength, m_normalizedTarget);
2042 inline SearchBuffer::~SearchBuffer()
2044 // Leave the static object pointing to a valid string.
2045 UErrorCode status = U_ZERO_ERROR;
2046 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
2047 ASSERT(status == U_ZERO_ERROR);
2048 usearch_setText(WebCore::searcher(), &newlineCharacter, 1, &status);
2049 ASSERT(status == U_ZERO_ERROR);
2054 inline size_t SearchBuffer::append(StringView text)
2056 ASSERT(text.length());
2062 } else if (m_buffer.size() == m_buffer.capacity()) {
2063 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
2064 m_prefixLength -= std::min(m_prefixLength, m_buffer.size() - m_overlap);
2065 m_buffer.shrink(m_overlap);
2068 size_t oldLength = m_buffer.size();
2069 size_t usableLength = std::min<size_t>(m_buffer.capacity() - oldLength, text.length());
2070 ASSERT(usableLength);
2071 m_buffer.grow(oldLength + usableLength);
2072 for (unsigned i = 0; i < usableLength; ++i)
2073 m_buffer[oldLength + i] = foldQuoteMark(text[i]);
2074 return usableLength;
2077 inline bool SearchBuffer::needsMoreContext() const
2079 return m_needsMoreContext;
2082 inline void SearchBuffer::prependContext(StringView text)
2084 ASSERT(m_needsMoreContext);
2085 ASSERT(m_prefixLength == m_buffer.size());
2092 size_t wordBoundaryContextStart = text.length();
2093 if (wordBoundaryContextStart) {
2094 U16_BACK_1(text, 0, wordBoundaryContextStart);
2095 wordBoundaryContextStart = startOfLastWordBoundaryContext(text.substring(0, wordBoundaryContextStart));
2098 size_t usableLength = std::min(m_buffer.capacity() - m_prefixLength, text.length() - wordBoundaryContextStart);
2099 WTF::append(m_buffer, text.substring(text.length() - usableLength, usableLength));
2100 m_prefixLength += usableLength;
2102 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2103 m_needsMoreContext = false;
2106 inline bool SearchBuffer::atBreak() const
2111 inline void SearchBuffer::reachedBreak()
2116 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2118 // This function implements the kana workaround. If usearch treats
2119 // it as a match, but we do not want to, then it's a "bad match".
2120 if (!m_targetRequiresKanaWorkaround)
2123 // Normalize into a match buffer. We reuse a single buffer rather than
2124 // creating a new one each time.
2125 normalizeCharacters(match, matchLength, m_normalizedMatch);
2127 const UChar* a = m_normalizedTarget.begin();
2128 const UChar* aEnd = m_normalizedTarget.end();
2130 const UChar* b = m_normalizedMatch.begin();
2131 const UChar* bEnd = m_normalizedMatch.end();
2134 // Skip runs of non-kana-letter characters. This is necessary so we can
2135 // correctly handle strings where the target and match have different-length
2136 // runs of characters that match, while still double checking the correctness
2137 // of matches of kana letters with other kana letters.
2138 while (a != aEnd && !isKanaLetter(*a))
2140 while (b != bEnd && !isKanaLetter(*b))
2143 // If we reached the end of either the target or the match, we should have
2144 // reached the end of both; both should have the same number of kana letters.
2145 if (a == aEnd || b == bEnd) {
2151 // Check for differences in the kana letter character itself.
2152 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2154 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2159 // Check for differences in combining voiced sound marks found after the letter.
2161 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2162 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2166 if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2176 inline bool SearchBuffer::isWordEndMatch(size_t start, size_t length) const
2179 ASSERT(m_options & AtWordEnds);
2182 // Start searching at the end of matched search, so that multiple word matches succeed.
2183 findEndWordBoundary(StringView(m_buffer.data(), m_buffer.size()), start + length - 1, &endWord);
2184 return static_cast<size_t>(endWord) == (start + length);
2187 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2189 ASSERT(m_options & AtWordStarts);
2194 int size = m_buffer.size();
2196 UChar32 firstCharacter;
2197 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2199 if (m_options & TreatMedialCapitalAsWordStart) {
2200 UChar32 previousCharacter;
2201 U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2203 if (isSeparator(firstCharacter)) {
2204 // The start of a separator run is a word start (".org" in "webkit.org").
2205 if (!isSeparator(previousCharacter))
2207 } else if (isASCIIUpper(firstCharacter)) {
2208 // The start of an uppercase run is a word start ("Kit" in "WebKit").
2209 if (!isASCIIUpper(previousCharacter))
2211 // The last character of an uppercase run followed by a non-separator, non-digit
2212 // is a word start ("Request" in "XMLHTTPRequest").
2214 U16_FWD_1(m_buffer.data(), offset, size);
2215 UChar32 nextCharacter = 0;
2217 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2218 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2220 } else if (isASCIIDigit(firstCharacter)) {
2221 // The start of a digit run is a word start ("2" in "WebKit2").
2222 if (!isASCIIDigit(previousCharacter))
2224 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2225 // The start of a non-separator, non-uppercase, non-digit run is a word start,
2226 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2231 // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes
2232 // a word, so treat the position before any CJK character as a word start.
2233 if (FontCascade::isCJKIdeographOrSymbol(firstCharacter))
2236 size_t wordBreakSearchStart = start + length;
2237 while (wordBreakSearchStart > start)
2238 wordBreakSearchStart = findNextWordFromIndex(StringView(m_buffer.data(), m_buffer.size()), wordBreakSearchStart, false /* backwards */);
2239 return wordBreakSearchStart == start;
2242 inline size_t SearchBuffer::search(size_t& start)
2244 size_t size = m_buffer.size();
2249 if (size != m_buffer.capacity())
2253 UStringSearch* searcher = WebCore::searcher();
2255 UErrorCode status = U_ZERO_ERROR;
2256 usearch_setText(searcher, m_buffer.data(), size, &status);
2257 ASSERT(status == U_ZERO_ERROR);
2259 usearch_setOffset(searcher, m_prefixLength, &status);
2260 ASSERT(status == U_ZERO_ERROR);
2262 int matchStart = usearch_next(searcher, &status);
2263 ASSERT(status == U_ZERO_ERROR);
2266 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2267 ASSERT(matchStart == USEARCH_DONE);
2271 // Matches that start in the overlap area are only tentative.
2272 // The same match may appear later, matching more characters,
2273 // possibly including a combining character that's not yet in the buffer.
2274 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2275 size_t overlap = m_overlap;
2276 if (m_options & AtWordStarts) {
2277 // Ensure that there is sufficient context before matchStart the next time around for
2278 // determining if it is at a word boundary.
2279 unsigned wordBoundaryContextStart = matchStart;
2280 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2281 wordBoundaryContextStart = startOfLastWordBoundaryContext(StringView(m_buffer.data(), wordBoundaryContextStart));
2282 overlap = std::min(size - 1, std::max(overlap, size - wordBoundaryContextStart));
2284 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2285 m_prefixLength -= std::min(m_prefixLength, size - overlap);
2286 m_buffer.shrink(overlap);
2290 size_t matchedLength = usearch_getMatchedLength(searcher);
2291 ASSERT_WITH_SECURITY_IMPLICATION(matchStart + matchedLength <= size);
2293 // If this match is "bad", move on to the next match.
2294 if (isBadMatch(m_buffer.data() + matchStart, matchedLength)
2295 || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))
2296 || ((m_options & AtWordEnds) && !isWordEndMatch(matchStart, matchedLength))) {
2297 matchStart = usearch_next(searcher, &status);
2298 ASSERT(status == U_ZERO_ERROR);
2302 size_t newSize = size - (matchStart + 1);
2303 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2304 m_prefixLength -= std::min<size_t>(m_prefixLength, matchStart + 1);
2305 m_buffer.shrink(newSize);
2307 start = size - matchStart;
2308 return matchedLength;
2313 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2314 : m_target(options & CaseInsensitive ? target.foldCase() : target)
2315 , m_options(options)
2316 , m_buffer(m_target.length())
2317 , m_isCharacterStartBuffer(m_target.length())
2318 , m_isBufferFull(false)
2321 ASSERT(!m_target.isEmpty());
2322 m_target.replace(noBreakSpace, ' ');
2323 foldQuoteMarks(m_target);
2326 inline SearchBuffer::~SearchBuffer()
2330 inline void SearchBuffer::reachedBreak()
2333 m_isBufferFull = false;
2336 inline bool SearchBuffer::atBreak() const
2338 return !m_cursor && !m_isBufferFull;
2341 inline void SearchBuffer::append(UChar c, bool isStart)
2343 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMark(c);
2344 m_isCharacterStartBuffer[m_cursor] = isStart;
2345 if (++m_cursor == m_target.length()) {
2347 m_isBufferFull = true;
2351 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2354 if (!(m_options & CaseInsensitive)) {
2355 append(characters[0], true);
2358 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2359 UChar foldedCharacters[maxFoldedCharacters];
2360 UErrorCode status = U_ZERO_ERROR;
2361 int numFoldedCharacters = u_strFoldCase(foldedCharacters, maxFoldedCharacters, characters, 1, U_FOLD_CASE_DEFAULT, &status);
2362 ASSERT(U_SUCCESS(status));
2363 ASSERT(numFoldedCharacters);
2364 ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2365 if (U_SUCCESS(status) && numFoldedCharacters) {
2366 numFoldedCharacters = std::min(numFoldedCharacters, maxFoldedCharacters);
2367 append(foldedCharacters[0], true);
2368 for (int i = 1; i < numFoldedCharacters; ++i)
2369 append(foldedCharacters[i], false);
2374 inline bool SearchBuffer::needsMoreContext() const
2379 void SearchBuffer::prependContext(const UChar*, size_t)
2381 ASSERT_NOT_REACHED();
2384 inline size_t SearchBuffer::search(size_t& start)
2386 if (!m_isBufferFull)
2388 if (!m_isCharacterStartBuffer[m_cursor])
2391 size_t tailSpace = m_target.length() - m_cursor;
2392 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2394 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2399 // Now that we've found a match once, we don't want to find it again, because those
2400 // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2401 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2402 // we want to get rid of that in the future we could track this with a separate boolean
2403 // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2404 m_isCharacterStartBuffer[m_cursor] = false;
2409 // Returns the number of characters that were appended to the buffer (what we are searching in).
2410 // That's not necessarily the same length as the passed-in target string, because case folding
2411 // can make two strings match even though they're not the same length.
2412 size_t SearchBuffer::length() const
2414 size_t bufferSize = m_target.length();
2416 for (size_t i = 0; i < bufferSize; ++i)
2417 length += m_isCharacterStartBuffer[i];
2425 int TextIterator::rangeLength(const Range* range, bool forSelectionPreservation)
2427 unsigned length = 0;
2428 for (TextIterator it(range, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2429 length += it.text().length();
2433 Ref<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2435 CharacterIterator entireRangeIterator(*entireRange);
2436 return characterSubrange(entireRange->ownerDocument(), entireRangeIterator, characterOffset, characterCount);
2439 static inline bool isInsideReplacedElement(TextIterator& iterator)
2441 ASSERT(!iterator.atEnd());
2442 ASSERT(iterator.text().length() == 1);
2443 Node* node = iterator.node();
2444 return node && isRendererReplacedElement(node->renderer());
2447 RefPtr<Range> TextIterator::rangeFromLocationAndLength(ContainerNode* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2449 Ref<Range> resultRange = scope->document().createRange();
2451 int docTextPosition = 0;
2452 int rangeEnd = rangeLocation + rangeLength;
2453 bool startRangeFound = false;
2455 Ref<Range> textRunRange = rangeOfContents(*scope);
2457 TextIterator it(textRunRange.ptr(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2459 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2460 if (!rangeLocation && !rangeLength && it.atEnd()) {
2461 resultRange->setStart(textRunRange->startContainer(), 0);
2462 resultRange->setEnd(textRunRange->startContainer(), 0);
2463 return WTFMove(resultRange);
2466 for (; !it.atEnd(); it.advance()) {
2467 int length = it.text().length();
2468 textRunRange = it.range();
2470 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + length;
2471 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + length;
2474 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2475 // position for emitted '\n's or if the renderer of the current node is a replaced element.
2476 if (length == 1 && (it.text()[0] == '\n' || isInsideReplacedElement(it))) {
2479 Ref<Range> range = it.range();
2480 textRunRange->setEnd(range->startContainer(), range->startOffset());
2482 Position runStart = textRunRange->startPosition();
2483 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2484 if (runEnd.isNotNull())
2485 textRunRange->setEnd(*runEnd.containerNode(), runEnd.computeOffsetInContainerNode());
2491 startRangeFound = true;
2492 if (textRunRange->startContainer().isTextNode()) {
2493 int offset = rangeLocation - docTextPosition;
2494 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset());
2496 if (rangeLocation == docTextPosition)
2497 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset());
2499 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset());
2504 if (textRunRange->startContainer().isTextNode()) {
2505 int offset = rangeEnd - docTextPosition;
2506 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset());
2508 if (rangeEnd == docTextPosition)
2509 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset());
2511 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset());
2513 docTextPosition += length;
2517 docTextPosition += length;
2520 if (!startRangeFound)
2523 if (rangeLength && rangeEnd > docTextPosition) // rangeEnd is out of bounds
2524 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset());
2526 return WTFMove(resultRange);
2529 bool TextIterator::getLocationAndLengthFromRange(Node* scope, const Range* range, size_t& location, size_t& length)
2531 location = notFound;
2534 // The critical assumption is that this only gets called with ranges that
2535 // concentrate on a given area containing the selection root. This is done
2536 // because of text fields and textareas. The DOM for those is not
2537 // directly in the document DOM, so ensure that the range does not cross a
2538 // boundary of one of those.
2539 if (&range->startContainer() != scope && !range->startContainer().isDescendantOf(scope))
2541 if (&range->endContainer() != scope && !range->endContainer().isDescendantOf(scope))
2544 Ref<Range> testRange = Range::create(scope->document(), scope, 0, &range->startContainer(), range->startOffset());
2545 ASSERT(&testRange->startContainer() == scope);
2546 location = TextIterator::rangeLength(testRange.ptr());
2548 testRange->setEnd(range->endContainer(), range->endOffset());
2549 ASSERT(&testRange->startContainer() == scope);
2550 length = TextIterator::rangeLength(testRange.ptr()) - location;
2556 String plainText(const Range* r, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2558 // The initial buffer size can be critical for performance: https://bugs.webkit.org/show_bug.cgi?id=81192
2559 static const unsigned initialCapacity = 1 << 15;
2561 unsigned bufferLength = 0;
2562 StringBuilder builder;
2563 builder.reserveCapacity(initialCapacity);
2564 TextIteratorBehavior behavior = defaultBehavior;
2565 if (!isDisplayString)
2566 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2568 for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2569 it.appendTextToStringBuilder(builder);
2570 bufferLength += it.text().length();
2574 return emptyString();
2576 String result = builder.toString();
2578 if (isDisplayString)
2579 r->ownerDocument().displayStringModifiedByEncoding(result);
2584 String plainTextReplacingNoBreakSpace(const Range* range, TextIteratorBehavior defaultBehavior, bool isDisplayString)
2586 return plainText(range, defaultBehavior, isDisplayString).replace(noBreakSpace, ' ');
2589 static Ref<Range> collapsedToBoundary(const Range& range, bool forward)
2591 Ref<Range> result = range.cloneRange();
2592 result->collapse(!forward);
2596 static std::optional<std::pair<size_t, size_t>> findPlainTextOffset(SearchBuffer& buffer, CharacterIterator& findIterator, bool searchForward)
2598 size_t matchStart = 0;
2599 size_t matchLength = 0;
2600 while (!findIterator.atEnd()) {
2601 findIterator.advance(buffer.append(findIterator.text()));
2603 size_t matchStartOffset;
2604 size_t newMatchLength = buffer.search(matchStartOffset);
2605 if (!newMatchLength) {
2606 if (findIterator.atBreak() && !buffer.atBreak()) {
2607 buffer.reachedBreak();
2612 size_t lastCharacterInBufferOffset = findIterator.characterOffset();
2613 ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2614 matchStart = lastCharacterInBufferOffset - matchStartOffset;
2615 matchLength = newMatchLength;
2616 if (searchForward) // Look for the last match when searching backwards instead.
2617 return std::pair<size_t, size_t> { matchStart, matchLength };
2622 return std::nullopt;
2624 return std::pair<size_t, size_t> { matchStart, matchLength };
2627 Ref<Range> findPlainText(const Range& range, const String& target, FindOptions options)
2629 SearchBuffer buffer(target, options);
2631 if (buffer.needsMoreContext()) {
2632 Ref<Range> beforeStartRange = range.ownerDocument().createRange();
2633 beforeStartRange->setEnd(range.startContainer(), range.startOffset());
2634 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2635 buffer.prependContext(backwardsIterator.text());
2636 if (!buffer.needsMoreContext())
2641 bool searchForward = !(options & Backwards);
2642 TextIteratorBehavior iteratorOptions = TextIteratorEntersTextControls | TextIteratorClipsToFrameAncestors;
2644 CharacterIterator findIterator(range, iteratorOptions);
2645 auto result = findPlainTextOffset(buffer, findIterator, searchForward);
2647 return collapsedToBoundary(range, searchForward);
2649 CharacterIterator rangeComputeIterator(range, iteratorOptions);
2650 return characterSubrange(range.ownerDocument(), rangeComputeIterator, result->first, result->second);