Unreviewed, fix -Wmisleading-indentation warning introduced in r246764
[WebKit-https.git] / Source / WebCore / contentextensions / CombinedURLFilters.cpp
1 /*
2  * Copyright (C) 2015 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "CombinedURLFilters.h"
28
29 #if ENABLE(CONTENT_EXTENSIONS)
30
31 #include "HashableActionList.h"
32 #include "Term.h"
33 #include <wtf/DataLog.h>
34 #include <wtf/Vector.h>
35 #include <wtf/text/CString.h>
36
37 namespace WebCore {
38
39 namespace ContentExtensions {
40
41 struct PrefixTreeEdge {
42     const Term* term;
43     std::unique_ptr<PrefixTreeVertex> child;
44 };
45     
46 typedef Vector<PrefixTreeEdge, 0, WTF::CrashOnOverflow, 1> PrefixTreeEdges;
47
48 struct PrefixTreeVertex {
49     WTF_MAKE_STRUCT_FAST_ALLOCATED;
50
51     PrefixTreeEdges edges;
52 };
53
54 struct ReverseSuffixTreeVertex;
55 struct ReverseSuffixTreeEdge {
56     const Term* term;
57     std::unique_ptr<ReverseSuffixTreeVertex> child;
58 };
59 typedef Vector<ReverseSuffixTreeEdge, 0, WTF::CrashOnOverflow, 1> ReverseSuffixTreeEdges;
60
61 struct ReverseSuffixTreeVertex {
62     ReverseSuffixTreeEdges edges;
63     uint32_t nodeId;
64 };
65 typedef HashMap<HashableActionList, ReverseSuffixTreeVertex, HashableActionListHash, HashableActionListHashTraits> ReverseSuffixTreeRoots;
66
67 #if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
68 static size_t recursiveMemoryUsed(const PrefixTreeVertex& vertex)
69 {
70     size_t size = sizeof(PrefixTreeVertex)
71         + vertex.edges.capacity() * sizeof(PrefixTreeEdge);
72     for (const auto& edge : vertex.edges) {
73         ASSERT(edge.child);
74         size += recursiveMemoryUsed(*edge.child.get());
75     }
76     return size;
77 }
78
79 size_t CombinedURLFilters::memoryUsed() const
80 {
81     ASSERT(m_prefixTreeRoot);
82
83     size_t actionsSize = 0;
84     for (const auto& slot : m_actions)
85         actionsSize += slot.value.capacity() * sizeof(uint64_t);
86
87     return sizeof(CombinedURLFilters)
88         + m_alphabet.memoryUsed()
89         + recursiveMemoryUsed(*m_prefixTreeRoot.get())
90         + sizeof(HashMap<PrefixTreeVertex*, ActionList>)
91         + m_actions.capacity() * (sizeof(PrefixTreeVertex*) + sizeof(ActionList))
92         + actionsSize;
93 }
94 #endif
95     
96 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
97 static String prefixTreeVertexToString(const PrefixTreeVertex& vertex, const HashMap<const PrefixTreeVertex*, ActionList>& actions, unsigned depth)
98 {
99     StringBuilder builder;
100     while (depth--)
101         builder.appendLiteral("  ");
102     builder.appendLiteral("vertex actions: ");
103
104     auto actionsSlot = actions.find(&vertex);
105     if (actionsSlot != actions.end()) {
106         for (uint64_t action : actionsSlot->value) {
107             builder.appendNumber(action);
108             builder.append(',');
109         }
110     }
111     builder.append('\n');
112     return builder.toString();
113 }
114
115 static void recursivePrint(const PrefixTreeVertex& vertex, const HashMap<const PrefixTreeVertex*, ActionList>& actions, unsigned depth)
116 {
117     dataLogF("%s", prefixTreeVertexToString(vertex, actions, depth).utf8().data());
118     for (const auto& edge : vertex.edges) {
119         StringBuilder builder;
120         for (unsigned i = 0; i < depth * 2; ++i)
121             builder.append(' ');
122         builder.appendLiteral("vertex edge: ");
123         builder.append(edge.term->toString());
124         builder.append('\n');
125         dataLogF("%s", builder.toString().utf8().data());
126         ASSERT(edge.child);
127         recursivePrint(*edge.child.get(), actions, depth + 1);
128     }
129 }
130
131 void CombinedURLFilters::print() const
132 {
133     recursivePrint(*m_prefixTreeRoot.get(), m_actions, 0);
134 }
135 #endif
136
137 CombinedURLFilters::CombinedURLFilters()
138     : m_prefixTreeRoot(std::make_unique<PrefixTreeVertex>())
139 {
140 }
141
142 CombinedURLFilters::~CombinedURLFilters() = default;
143
144 bool CombinedURLFilters::isEmpty() const
145 {
146     return m_prefixTreeRoot->edges.isEmpty();
147 }
148
149 void CombinedURLFilters::addDomain(uint64_t actionId, const String& domain)
150 {
151     unsigned domainLength = domain.length();
152     if (domainLength && domain[0] == '*') {
153         // If domain starts with a '*' then it means match domain and its subdomains, like (^|.)domain$
154         // This way a domain of "*webkit.org" will match "bugs.webkit.org" and "webkit.org".
155         Vector<Term> prependDot;
156         Vector<Term> prependBeginningOfLine;
157         prependDot.reserveInitialCapacity(domainLength + 2);
158         prependBeginningOfLine.reserveInitialCapacity(domainLength); // This is just no .* at the beginning.
159         
160         Term canonicalDotStar(Term::UniversalTransition);
161         canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore);
162         prependDot.uncheckedAppend(canonicalDotStar);
163         prependDot.uncheckedAppend(Term('.', true));
164         
165         for (unsigned i = 1; i < domainLength; i++) {
166             ASSERT(isASCII(domain[i]));
167             ASSERT(!isASCIIUpper(domain[i]));
168             prependDot.uncheckedAppend(Term(domain[i], true));
169             prependBeginningOfLine.uncheckedAppend(Term(domain[i], true));
170         }
171         prependDot.uncheckedAppend(Term::EndOfLineAssertionTerm);
172         prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm);
173         
174         addPattern(actionId, prependDot);
175         addPattern(actionId, prependBeginningOfLine);
176     } else {
177         // This is like adding ^domain$, but interpreting domain as a series of characters, not a regular expression.
178         // "webkit.org" will match "webkit.org" but not "bugs.webkit.org".
179         Vector<Term> prependBeginningOfLine;
180         prependBeginningOfLine.reserveInitialCapacity(domainLength + 1); // This is just no .* at the beginning.
181         for (unsigned i = 0; i < domainLength; i++) {
182             ASSERT(isASCII(domain[i]));
183             ASSERT(!isASCIIUpper(domain[i]));
184             prependBeginningOfLine.uncheckedAppend(Term(domain[i], true));
185         }
186         prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm);
187         addPattern(actionId, prependBeginningOfLine);
188     }
189 }
190
191 void CombinedURLFilters::addPattern(uint64_t actionId, const Vector<Term>& pattern)
192 {
193     ASSERT_WITH_MESSAGE(!pattern.isEmpty(), "The parser should have excluded empty patterns before reaching CombinedURLFilters.");
194
195     if (pattern.isEmpty())
196         return;
197
198     // Extend the prefix tree with the new pattern.
199     PrefixTreeVertex* lastPrefixTree = m_prefixTreeRoot.get();
200
201     for (const Term& term : pattern) {
202         size_t nextEntryIndex = WTF::notFound;
203         for (size_t i = 0; i < lastPrefixTree->edges.size(); ++i) {
204             if (*lastPrefixTree->edges[i].term == term) {
205                 nextEntryIndex = i;
206                 break;
207             }
208         }
209         if (nextEntryIndex != WTF::notFound)
210             lastPrefixTree = lastPrefixTree->edges[nextEntryIndex].child.get();
211         else {
212             lastPrefixTree->edges.append(PrefixTreeEdge({m_alphabet.interned(term), std::make_unique<PrefixTreeVertex>()}));
213             lastPrefixTree = lastPrefixTree->edges.last().child.get();
214         }
215     }
216
217     auto addResult = m_actions.add(lastPrefixTree, ActionList());
218     ActionList& actions = addResult.iterator->value;
219     if (actions.find(actionId) == WTF::notFound)
220         actions.append(actionId);
221 }
222
223 struct ActiveSubtree {
224     ActiveSubtree(PrefixTreeVertex& vertex, ImmutableCharNFANodeBuilder&& nfaNode, unsigned edgeIndex)
225         : vertex(vertex)
226         , nfaNode(WTFMove(nfaNode))
227         , edgeIndex(edgeIndex)
228     {
229     }
230     PrefixTreeVertex& vertex;
231     ImmutableCharNFANodeBuilder nfaNode;
232     unsigned edgeIndex;
233 };
234
235 static void generateInfixUnsuitableForReverseSuffixTree(NFA& nfa, Vector<ActiveSubtree>& stack, const HashMap<const PrefixTreeVertex*, ActionList>& actions)
236 {
237     // To avoid conflicts, we use the reverse suffix tree for subtrees that do not merge
238     // in the prefix tree.
239     //
240     // We only unify the suffixes to the actions on the leaf.
241     // If there are actions inside the tree, we generate the part of the subtree up to the action.
242     //
243     // If we accidentally insert a node with action inside the reverse-suffix-tree, we would create
244     // new actions on unrelated pattern when unifying their suffixes.
245     for (unsigned i = stack.size() - 1; i--;) {
246         ActiveSubtree& activeSubtree = stack[i];
247         if (activeSubtree.nfaNode.isValid())
248             return;
249
250         RELEASE_ASSERT_WITH_MESSAGE(i > 0, "The bottom of the stack must be the root of our fixed-length subtree. It should have it the isValid() case above.");
251
252         auto actionsIterator = actions.find(&activeSubtree.vertex);
253         bool hasActionInsideTree = actionsIterator != actions.end();
254
255         // Stricto sensu, we should count the number of exit edges with fixed length.
256         // That is costly and unlikely to matter in practice.
257         bool hasSingleOutcome = activeSubtree.vertex.edges.size() == 1;
258
259         if (hasActionInsideTree || !hasSingleOutcome) {
260             // Go back to the end of the subtree that has already been generated.
261             // From there, generate everything up to the vertex we found.
262             unsigned end = i;
263             unsigned beginning = end;
264
265             ActiveSubtree* sourceActiveSubtree = nullptr;
266             while (beginning--) {
267                 ActiveSubtree& activeSubtree = stack[beginning];
268                 if (activeSubtree.nfaNode.isValid()) {
269                     sourceActiveSubtree = &activeSubtree;
270                     break;
271                 }
272             }
273             ASSERT_WITH_MESSAGE(sourceActiveSubtree, "The root should always have a valid generator.");
274
275             for (unsigned stackIndex = beginning + 1; stackIndex <= end; ++stackIndex) {
276                 ImmutableCharNFANodeBuilder& sourceNode = sourceActiveSubtree->nfaNode;
277                 ASSERT(sourceNode.isValid());
278                 auto& edge = sourceActiveSubtree->vertex.edges[sourceActiveSubtree->edgeIndex];
279
280                 ActiveSubtree& destinationActiveSubtree = stack[stackIndex];
281                 destinationActiveSubtree.nfaNode = edge.term->generateGraph(nfa, sourceNode, actions.get(&destinationActiveSubtree.vertex));
282
283                 sourceActiveSubtree = &destinationActiveSubtree;
284             }
285
286             return;
287         }
288     }
289 }
290
291 static void generateSuffixWithReverseSuffixTree(NFA& nfa, Vector<ActiveSubtree>& stack, const HashMap<const PrefixTreeVertex*, ActionList>& actions, ReverseSuffixTreeRoots& reverseSuffixTreeRoots)
292 {
293     ActiveSubtree& leafSubtree = stack.last();
294     ASSERT_WITH_MESSAGE(!leafSubtree.nfaNode.isValid(), "The leaf should never be generated by the code above, it should always be inserted into the prefix tree.");
295
296     ActionList actionList = actions.get(&leafSubtree.vertex);
297     ASSERT_WITH_MESSAGE(!actionList.isEmpty(), "Our prefix tree should always have actions on the leaves by construction.");
298
299     HashableActionList hashableActionList(actionList);
300     auto rootAddResult = reverseSuffixTreeRoots.add(hashableActionList, ReverseSuffixTreeVertex());
301     if (rootAddResult.isNewEntry) {
302         ImmutableCharNFANodeBuilder newNode(nfa);
303         newNode.setActions(actionList.begin(), actionList.end());
304         rootAddResult.iterator->value.nodeId = newNode.nodeId();
305     }
306
307     ReverseSuffixTreeVertex* activeReverseSuffixTreeVertex = &rootAddResult.iterator->value;
308     uint32_t destinationNodeId = rootAddResult.iterator->value.nodeId;
309
310     unsigned stackPosition = stack.size() - 2;
311     while (true) {
312         ActiveSubtree& source = stack[stackPosition];
313         auto& edge = source.vertex.edges[source.edgeIndex];
314
315         // This is the end condition: when we meet a node that has already been generated,
316         // we just need to connect our backward tree to the forward tree.
317         //
318         // We *must not* add this last node to the reverse-suffix tree. That node can have
319         // transitions back to earlier part of the prefix tree. If the prefix tree "caches"
320         // such node, it would create new transitions that did not exist in the source language.
321         if (source.nfaNode.isValid()) {
322             stack.shrink(stackPosition + 1);
323             edge.term->generateGraph(nfa, source.nfaNode, destinationNodeId);
324             return;
325         }
326         --stackPosition;
327
328         ASSERT_WITH_MESSAGE(!actions.contains(&source.vertex), "Any node with final actions should have been created before hitting the reverse suffix-tree.");
329
330         ReverseSuffixTreeEdge* existingEdge = nullptr;
331         for (ReverseSuffixTreeEdge& potentialExistingEdge : activeReverseSuffixTreeVertex->edges) {
332             if (edge.term == potentialExistingEdge.term) {
333                 existingEdge = &potentialExistingEdge;
334                 break;
335             }
336         }
337
338         if (existingEdge)
339             activeReverseSuffixTreeVertex = existingEdge->child.get();
340         else {
341             ImmutableCharNFANodeBuilder newNode(nfa);
342             edge.term->generateGraph(nfa, newNode, destinationNodeId);
343             std::unique_ptr<ReverseSuffixTreeVertex> newVertex(new ReverseSuffixTreeVertex());
344             newVertex->nodeId = newNode.nodeId();
345
346             ReverseSuffixTreeVertex* newVertexAddress = newVertex.get();
347             activeReverseSuffixTreeVertex->edges.append(ReverseSuffixTreeEdge({ edge.term, WTFMove(newVertex) }));
348             activeReverseSuffixTreeVertex = newVertexAddress;
349         }
350         destinationNodeId = activeReverseSuffixTreeVertex->nodeId;
351
352         ASSERT(source.vertex.edges.size() == 1);
353         source.vertex.edges.clear();
354     }
355
356     RELEASE_ASSERT_NOT_REACHED();
357 }
358
359 static void clearReverseSuffixTree(ReverseSuffixTreeRoots& reverseSuffixTreeRoots)
360 {
361     // We cannot rely on the destructor being called in order from top to bottom as we may overflow
362     // the stack. Instead, we go depth first in the reverse-suffix-tree.
363
364     for (auto& slot : reverseSuffixTreeRoots) {
365         Vector<ReverseSuffixTreeVertex*, 128> stack;
366         stack.append(&slot.value);
367
368         while (true) {
369             ReverseSuffixTreeVertex* top = stack.last();
370             if (top->edges.isEmpty()) {
371                 stack.removeLast();
372                 if (stack.isEmpty())
373                     break;
374                 stack.last()->edges.removeLast();
375             } else
376                 stack.append(top->edges.last().child.get());
377         }
378     }
379     reverseSuffixTreeRoots.clear();
380 }
381
382 static void generateNFAForSubtree(NFA& nfa, ImmutableCharNFANodeBuilder&& subtreeRoot, PrefixTreeVertex& root, const HashMap<const PrefixTreeVertex*, ActionList>& actions, size_t maxNFASize)
383 {
384     // This recurses the subtree of the prefix tree.
385     // For each edge that has fixed length (no quantifiers like ?, *, or +) it generates the nfa graph,
386     // recurses into children, and deletes any processed leaf nodes.
387
388     ReverseSuffixTreeRoots reverseSuffixTreeRoots;
389     Vector<ActiveSubtree> stack;
390     if (!root.edges.isEmpty())
391         stack.append(ActiveSubtree(root, WTFMove(subtreeRoot), 0));
392
393     bool nfaTooBig = false;
394     
395     // Generate graphs for each subtree that does not contain any quantifiers.
396     while (!stack.isEmpty()) {
397         PrefixTreeVertex& vertex = stack.last().vertex;
398         const unsigned edgeIndex = stack.last().edgeIndex;
399
400         if (edgeIndex < vertex.edges.size()) {
401             auto& edge = vertex.edges[edgeIndex];
402
403             // Clean up any processed leaves and return early if we are past the maxNFASize.
404             if (nfaTooBig) {
405                 stack.last().edgeIndex = stack.last().vertex.edges.size();
406                 continue;
407             }
408             
409             // Quantified edges in the subtree will be a part of another NFA.
410             if (!edge.term->hasFixedLength()) {
411                 stack.last().edgeIndex++;
412                 continue;
413             }
414
415             ASSERT(edge.child.get());
416             ImmutableCharNFANodeBuilder emptyBuilder;
417             stack.append(ActiveSubtree(*edge.child.get(), WTFMove(emptyBuilder), 0));
418         } else {
419             bool isLeaf = vertex.edges.isEmpty();
420
421             ASSERT(edgeIndex == vertex.edges.size());
422             vertex.edges.removeAllMatching([](PrefixTreeEdge& edge)
423             {
424                 return !edge.term;
425             });
426
427             if (isLeaf) {
428                 generateInfixUnsuitableForReverseSuffixTree(nfa, stack, actions);
429                 generateSuffixWithReverseSuffixTree(nfa, stack, actions, reverseSuffixTreeRoots);
430
431                 // Only stop generating an NFA at a leaf to ensure we have a correct NFA. We could go slightly over the maxNFASize.
432                 if (nfa.nodes.size() > maxNFASize)
433                     nfaTooBig = true;
434             } else
435                 stack.removeLast();
436
437             if (!stack.isEmpty()) {
438                 auto& activeSubtree = stack.last();
439                 auto& edge = activeSubtree.vertex.edges[stack.last().edgeIndex];
440                 if (edge.child->edges.isEmpty())
441                     edge.term = nullptr; // Mark this leaf for deleting.
442                 activeSubtree.edgeIndex++;
443             }
444         }
445     }
446     clearReverseSuffixTree(reverseSuffixTreeRoots);
447 }
448
449 void CombinedURLFilters::processNFAs(size_t maxNFASize, const WTF::Function<void(NFA&&)>& handler)
450 {
451 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
452     print();
453 #endif
454     while (true) {
455         // Traverse out to a leaf.
456         Vector<PrefixTreeVertex*, 128> stack;
457         PrefixTreeVertex* vertex = m_prefixTreeRoot.get();
458         while (true) {
459             ASSERT(vertex);
460             stack.append(vertex);
461             if (vertex->edges.isEmpty())
462                 break;
463             vertex = vertex->edges.last().child.get();
464         }
465         if (stack.size() == 1)
466             break; // We're done once we have processed and removed all the edges in the prefix tree.
467         
468         // Find the prefix root for this NFA. This is the vertex after the last term with a quantifier if there is one,
469         // or the root if there are no quantifiers left.
470         while (stack.size() > 1) {
471             if (!stack[stack.size() - 2]->edges.last().term->hasFixedLength())
472                 break;
473             stack.removeLast();
474         }
475         ASSERT_WITH_MESSAGE(!stack.isEmpty(), "At least the root should be in the stack");
476
477         // Make an NFA with the subtrees for whom this is also the last quantifier (or who also have no quantifier).
478         NFA nfa;
479         {
480             // Put the prefix into the NFA.
481             ImmutableCharNFANodeBuilder lastNode(nfa);
482             for (unsigned i = 0; i < stack.size() - 1; ++i) {
483                 const PrefixTreeEdge& edge = stack[i]->edges.last();
484                 ImmutableCharNFANodeBuilder newNode = edge.term->generateGraph(nfa, lastNode, m_actions.get(edge.child.get()));
485                 lastNode = WTFMove(newNode);
486             }
487
488             // Put the non-quantified vertices in the subtree into the NFA and delete them.
489             ASSERT(stack.last());
490             generateNFAForSubtree(nfa, WTFMove(lastNode), *stack.last(), m_actions, maxNFASize);
491         }
492         nfa.finalize();
493
494         handler(WTFMove(nfa));
495
496         // Clean up any processed leaf nodes.
497         while (true) {
498             if (stack.size() > 1) {
499                 if (stack[stack.size() - 1]->edges.isEmpty()) {
500                     stack[stack.size() - 2]->edges.removeLast();
501                     stack.removeLast();
502                 } else
503                     break; // Vertex is not a leaf.
504             } else
505                 break; // Leave the empty root.
506         }
507     }
508 }
509
510 } // namespace ContentExtensions
511 } // namespace WebCore
512
513 #endif // ENABLE(CONTENT_EXTENSIONS)