Use "= default" to denote default constructor or destructor
[WebKit-https.git] / Source / WebCore / contentextensions / CombinedURLFilters.cpp
1 /*
2  * Copyright (C) 2015 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23  * THE POSSIBILITY OF SUCH DAMAGE.
24  */
25
26 #include "config.h"
27 #include "CombinedURLFilters.h"
28
29 #if ENABLE(CONTENT_EXTENSIONS)
30
31 #include "HashableActionList.h"
32 #include "Term.h"
33 #include <wtf/DataLog.h>
34 #include <wtf/Vector.h>
35 #include <wtf/text/CString.h>
36
37 namespace WebCore {
38
39 namespace ContentExtensions {
40
41 struct PrefixTreeEdge {
42     const Term* term;
43     std::unique_ptr<PrefixTreeVertex> child;
44 };
45     
46 typedef Vector<PrefixTreeEdge, 0, WTF::CrashOnOverflow, 1> PrefixTreeEdges;
47
48 struct PrefixTreeVertex {
49     PrefixTreeEdges edges;
50 };
51
52 struct ReverseSuffixTreeVertex;
53 struct ReverseSuffixTreeEdge {
54     const Term* term;
55     std::unique_ptr<ReverseSuffixTreeVertex> child;
56 };
57 typedef Vector<ReverseSuffixTreeEdge, 0, WTF::CrashOnOverflow, 1> ReverseSuffixTreeEdges;
58
59 struct ReverseSuffixTreeVertex {
60     ReverseSuffixTreeEdges edges;
61     uint32_t nodeId;
62 };
63 typedef HashMap<HashableActionList, ReverseSuffixTreeVertex, HashableActionListHash, HashableActionListHashTraits> ReverseSuffixTreeRoots;
64
65 #if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
66 static size_t recursiveMemoryUsed(const PrefixTreeVertex& vertex)
67 {
68     size_t size = sizeof(PrefixTreeVertex)
69         + vertex.edges.capacity() * sizeof(PrefixTreeEdge);
70     for (const auto& edge : vertex.edges) {
71         ASSERT(edge.child);
72         size += recursiveMemoryUsed(*edge.child.get());
73     }
74     return size;
75 }
76
77 size_t CombinedURLFilters::memoryUsed() const
78 {
79     ASSERT(m_prefixTreeRoot);
80
81     size_t actionsSize = 0;
82     for (const auto& slot : m_actions)
83         actionsSize += slot.value.capacity() * sizeof(uint64_t);
84
85     return sizeof(CombinedURLFilters)
86         + m_alphabet.memoryUsed()
87         + recursiveMemoryUsed(*m_prefixTreeRoot.get())
88         + sizeof(HashMap<PrefixTreeVertex*, ActionList>)
89         + m_actions.capacity() * (sizeof(PrefixTreeVertex*) + sizeof(ActionList))
90         + actionsSize;
91 }
92 #endif
93     
94 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
95 static String prefixTreeVertexToString(const PrefixTreeVertex& vertex, const HashMap<const PrefixTreeVertex*, ActionList>& actions, unsigned depth)
96 {
97     StringBuilder builder;
98     while (depth--)
99         builder.appendLiteral("  ");
100     builder.appendLiteral("vertex actions: ");
101
102     auto actionsSlot = actions.find(&vertex);
103     if (actionsSlot != actions.end()) {
104         for (uint64_t action : actionsSlot->value) {
105             builder.appendNumber(action);
106             builder.append(',');
107         }
108     }
109     builder.append('\n');
110     return builder.toString();
111 }
112
113 static void recursivePrint(const PrefixTreeVertex& vertex, const HashMap<const PrefixTreeVertex*, ActionList>& actions, unsigned depth)
114 {
115     dataLogF("%s", prefixTreeVertexToString(vertex, actions, depth).utf8().data());
116     for (const auto& edge : vertex.edges) {
117         StringBuilder builder;
118         for (unsigned i = 0; i < depth * 2; ++i)
119             builder.append(' ');
120         builder.appendLiteral("vertex edge: ");
121         builder.append(edge.term->toString());
122         builder.append('\n');
123         dataLogF("%s", builder.toString().utf8().data());
124         ASSERT(edge.child);
125         recursivePrint(*edge.child.get(), actions, depth + 1);
126     }
127 }
128
129 void CombinedURLFilters::print() const
130 {
131     recursivePrint(*m_prefixTreeRoot.get(), m_actions, 0);
132 }
133 #endif
134
135 CombinedURLFilters::CombinedURLFilters()
136     : m_prefixTreeRoot(std::make_unique<PrefixTreeVertex>())
137 {
138 }
139
140 CombinedURLFilters::~CombinedURLFilters() = default;
141
142 bool CombinedURLFilters::isEmpty() const
143 {
144     return m_prefixTreeRoot->edges.isEmpty();
145 }
146
147 void CombinedURLFilters::addDomain(uint64_t actionId, const String& domain)
148 {
149     unsigned domainLength = domain.length();
150     if (domainLength && domain[0] == '*') {
151         // If domain starts with a '*' then it means match domain and its subdomains, like (^|.)domain$
152         // This way a domain of "*webkit.org" will match "bugs.webkit.org" and "webkit.org".
153         Vector<Term> prependDot;
154         Vector<Term> prependBeginningOfLine;
155         prependDot.reserveInitialCapacity(domainLength + 2);
156         prependBeginningOfLine.reserveInitialCapacity(domainLength); // This is just no .* at the beginning.
157         
158         Term canonicalDotStar(Term::UniversalTransition);
159         canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore);
160         prependDot.uncheckedAppend(canonicalDotStar);
161         prependDot.uncheckedAppend(Term('.', true));
162         
163         for (unsigned i = 1; i < domainLength; i++) {
164             ASSERT(isASCII(domain[i]));
165             ASSERT(!isASCIIUpper(domain[i]));
166             prependDot.uncheckedAppend(Term(domain[i], true));
167             prependBeginningOfLine.uncheckedAppend(Term(domain[i], true));
168         }
169         prependDot.uncheckedAppend(Term::EndOfLineAssertionTerm);
170         prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm);
171         
172         addPattern(actionId, prependDot);
173         addPattern(actionId, prependBeginningOfLine);
174     } else {
175         // This is like adding ^domain$, but interpreting domain as a series of characters, not a regular expression.
176         // "webkit.org" will match "webkit.org" but not "bugs.webkit.org".
177         Vector<Term> prependBeginningOfLine;
178         prependBeginningOfLine.reserveInitialCapacity(domainLength + 1); // This is just no .* at the beginning.
179         for (unsigned i = 0; i < domainLength; i++) {
180             ASSERT(isASCII(domain[i]));
181             ASSERT(!isASCIIUpper(domain[i]));
182             prependBeginningOfLine.uncheckedAppend(Term(domain[i], true));
183         }
184         prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm);
185         addPattern(actionId, prependBeginningOfLine);
186     }
187 }
188
189 void CombinedURLFilters::addPattern(uint64_t actionId, const Vector<Term>& pattern)
190 {
191     ASSERT_WITH_MESSAGE(!pattern.isEmpty(), "The parser should have excluded empty patterns before reaching CombinedURLFilters.");
192
193     if (pattern.isEmpty())
194         return;
195
196     // Extend the prefix tree with the new pattern.
197     PrefixTreeVertex* lastPrefixTree = m_prefixTreeRoot.get();
198
199     for (const Term& term : pattern) {
200         size_t nextEntryIndex = WTF::notFound;
201         for (size_t i = 0; i < lastPrefixTree->edges.size(); ++i) {
202             if (*lastPrefixTree->edges[i].term == term) {
203                 nextEntryIndex = i;
204                 break;
205             }
206         }
207         if (nextEntryIndex != WTF::notFound)
208             lastPrefixTree = lastPrefixTree->edges[nextEntryIndex].child.get();
209         else {
210             lastPrefixTree->edges.append(PrefixTreeEdge({m_alphabet.interned(term), std::make_unique<PrefixTreeVertex>()}));
211             lastPrefixTree = lastPrefixTree->edges.last().child.get();
212         }
213     }
214
215     auto addResult = m_actions.add(lastPrefixTree, ActionList());
216     ActionList& actions = addResult.iterator->value;
217     if (actions.find(actionId) == WTF::notFound)
218         actions.append(actionId);
219 }
220
221 struct ActiveSubtree {
222     ActiveSubtree(PrefixTreeVertex& vertex, ImmutableCharNFANodeBuilder&& nfaNode, unsigned edgeIndex)
223         : vertex(vertex)
224         , nfaNode(WTFMove(nfaNode))
225         , edgeIndex(edgeIndex)
226     {
227     }
228     PrefixTreeVertex& vertex;
229     ImmutableCharNFANodeBuilder nfaNode;
230     unsigned edgeIndex;
231 };
232
233 static void generateInfixUnsuitableForReverseSuffixTree(NFA& nfa, Vector<ActiveSubtree>& stack, const HashMap<const PrefixTreeVertex*, ActionList>& actions)
234 {
235     // To avoid conflicts, we use the reverse suffix tree for subtrees that do not merge
236     // in the prefix tree.
237     //
238     // We only unify the suffixes to the actions on the leaf.
239     // If there are actions inside the tree, we generate the part of the subtree up to the action.
240     //
241     // If we accidentally insert a node with action inside the reverse-suffix-tree, we would create
242     // new actions on unrelated pattern when unifying their suffixes.
243     for (unsigned i = stack.size() - 1; i--;) {
244         ActiveSubtree& activeSubtree = stack[i];
245         if (activeSubtree.nfaNode.isValid())
246             return;
247
248         RELEASE_ASSERT_WITH_MESSAGE(i > 0, "The bottom of the stack must be the root of our fixed-length subtree. It should have it the isValid() case above.");
249
250         auto actionsIterator = actions.find(&activeSubtree.vertex);
251         bool hasActionInsideTree = actionsIterator != actions.end();
252
253         // Stricto sensu, we should count the number of exit edges with fixed length.
254         // That is costly and unlikely to matter in practice.
255         bool hasSingleOutcome = activeSubtree.vertex.edges.size() == 1;
256
257         if (hasActionInsideTree || !hasSingleOutcome) {
258             // Go back to the end of the subtree that has already been generated.
259             // From there, generate everything up to the vertex we found.
260             unsigned end = i;
261             unsigned beginning = end;
262
263             ActiveSubtree* sourceActiveSubtree = nullptr;
264             while (beginning--) {
265                 ActiveSubtree& activeSubtree = stack[beginning];
266                 if (activeSubtree.nfaNode.isValid()) {
267                     sourceActiveSubtree = &activeSubtree;
268                     break;
269                 }
270             }
271             ASSERT_WITH_MESSAGE(sourceActiveSubtree, "The root should always have a valid generator.");
272
273             for (unsigned stackIndex = beginning + 1; stackIndex <= end; ++stackIndex) {
274                 ImmutableCharNFANodeBuilder& sourceNode = sourceActiveSubtree->nfaNode;
275                 ASSERT(sourceNode.isValid());
276                 auto& edge = sourceActiveSubtree->vertex.edges[sourceActiveSubtree->edgeIndex];
277
278                 ActiveSubtree& destinationActiveSubtree = stack[stackIndex];
279                 destinationActiveSubtree.nfaNode = edge.term->generateGraph(nfa, sourceNode, actions.get(&destinationActiveSubtree.vertex));
280
281                 sourceActiveSubtree = &destinationActiveSubtree;
282             }
283
284             return;
285         }
286     }
287 }
288
289 static void generateSuffixWithReverseSuffixTree(NFA& nfa, Vector<ActiveSubtree>& stack, const HashMap<const PrefixTreeVertex*, ActionList>& actions, ReverseSuffixTreeRoots& reverseSuffixTreeRoots)
290 {
291     ActiveSubtree& leafSubtree = stack.last();
292     ASSERT_WITH_MESSAGE(!leafSubtree.nfaNode.isValid(), "The leaf should never be generated by the code above, it should always be inserted into the prefix tree.");
293
294     ActionList actionList = actions.get(&leafSubtree.vertex);
295     ASSERT_WITH_MESSAGE(!actionList.isEmpty(), "Our prefix tree should always have actions on the leaves by construction.");
296
297     HashableActionList hashableActionList(actionList);
298     auto rootAddResult = reverseSuffixTreeRoots.add(hashableActionList, ReverseSuffixTreeVertex());
299     if (rootAddResult.isNewEntry) {
300         ImmutableCharNFANodeBuilder newNode(nfa);
301         newNode.setActions(actionList.begin(), actionList.end());
302         rootAddResult.iterator->value.nodeId = newNode.nodeId();
303     }
304
305     ReverseSuffixTreeVertex* activeReverseSuffixTreeVertex = &rootAddResult.iterator->value;
306     uint32_t destinationNodeId = rootAddResult.iterator->value.nodeId;
307
308     unsigned stackPosition = stack.size() - 2;
309     while (true) {
310         ActiveSubtree& source = stack[stackPosition];
311         auto& edge = source.vertex.edges[source.edgeIndex];
312
313         // This is the end condition: when we meet a node that has already been generated,
314         // we just need to connect our backward tree to the forward tree.
315         //
316         // We *must not* add this last node to the reverse-suffix tree. That node can have
317         // transitions back to earlier part of the prefix tree. If the prefix tree "caches"
318         // such node, it would create new transitions that did not exist in the source language.
319         if (source.nfaNode.isValid()) {
320             stack.shrink(stackPosition + 1);
321             edge.term->generateGraph(nfa, source.nfaNode, destinationNodeId);
322             return;
323         }
324         --stackPosition;
325
326         ASSERT_WITH_MESSAGE(!actions.contains(&source.vertex), "Any node with final actions should have been created before hitting the reverse suffix-tree.");
327
328         ReverseSuffixTreeEdge* existingEdge = nullptr;
329         for (ReverseSuffixTreeEdge& potentialExistingEdge : activeReverseSuffixTreeVertex->edges) {
330             if (edge.term == potentialExistingEdge.term) {
331                 existingEdge = &potentialExistingEdge;
332                 break;
333             }
334         }
335
336         if (existingEdge)
337             activeReverseSuffixTreeVertex = existingEdge->child.get();
338         else {
339             ImmutableCharNFANodeBuilder newNode(nfa);
340             edge.term->generateGraph(nfa, newNode, destinationNodeId);
341             std::unique_ptr<ReverseSuffixTreeVertex> newVertex(new ReverseSuffixTreeVertex());
342             newVertex->nodeId = newNode.nodeId();
343
344             ReverseSuffixTreeVertex* newVertexAddress = newVertex.get();
345             activeReverseSuffixTreeVertex->edges.append(ReverseSuffixTreeEdge({ edge.term, WTFMove(newVertex) }));
346             activeReverseSuffixTreeVertex = newVertexAddress;
347         }
348         destinationNodeId = activeReverseSuffixTreeVertex->nodeId;
349
350         ASSERT(source.vertex.edges.size() == 1);
351         source.vertex.edges.clear();
352     }
353
354     RELEASE_ASSERT_NOT_REACHED();
355 }
356
357 static void clearReverseSuffixTree(ReverseSuffixTreeRoots& reverseSuffixTreeRoots)
358 {
359     // We cannot rely on the destructor being called in order from top to bottom as we may overflow
360     // the stack. Instead, we go depth first in the reverse-suffix-tree.
361
362     for (auto& slot : reverseSuffixTreeRoots) {
363         Vector<ReverseSuffixTreeVertex*, 128> stack;
364         stack.append(&slot.value);
365
366         while (true) {
367             ReverseSuffixTreeVertex* top = stack.last();
368             if (top->edges.isEmpty()) {
369                 stack.removeLast();
370                 if (stack.isEmpty())
371                     break;
372                 stack.last()->edges.removeLast();
373             } else
374                 stack.append(top->edges.last().child.get());
375         }
376     }
377     reverseSuffixTreeRoots.clear();
378 }
379
380 static void generateNFAForSubtree(NFA& nfa, ImmutableCharNFANodeBuilder&& subtreeRoot, PrefixTreeVertex& root, const HashMap<const PrefixTreeVertex*, ActionList>& actions, size_t maxNFASize)
381 {
382     // This recurses the subtree of the prefix tree.
383     // For each edge that has fixed length (no quantifiers like ?, *, or +) it generates the nfa graph,
384     // recurses into children, and deletes any processed leaf nodes.
385
386     ReverseSuffixTreeRoots reverseSuffixTreeRoots;
387     Vector<ActiveSubtree> stack;
388     if (!root.edges.isEmpty())
389         stack.append(ActiveSubtree(root, WTFMove(subtreeRoot), 0));
390
391     bool nfaTooBig = false;
392     
393     // Generate graphs for each subtree that does not contain any quantifiers.
394     while (!stack.isEmpty()) {
395         PrefixTreeVertex& vertex = stack.last().vertex;
396         const unsigned edgeIndex = stack.last().edgeIndex;
397
398         if (edgeIndex < vertex.edges.size()) {
399             auto& edge = vertex.edges[edgeIndex];
400
401             // Clean up any processed leaves and return early if we are past the maxNFASize.
402             if (nfaTooBig) {
403                 stack.last().edgeIndex = stack.last().vertex.edges.size();
404                 continue;
405             }
406             
407             // Quantified edges in the subtree will be a part of another NFA.
408             if (!edge.term->hasFixedLength()) {
409                 stack.last().edgeIndex++;
410                 continue;
411             }
412
413             ASSERT(edge.child.get());
414             ImmutableCharNFANodeBuilder emptyBuilder;
415             stack.append(ActiveSubtree(*edge.child.get(), WTFMove(emptyBuilder), 0));
416         } else {
417             bool isLeaf = vertex.edges.isEmpty();
418
419             ASSERT(edgeIndex == vertex.edges.size());
420             vertex.edges.removeAllMatching([](PrefixTreeEdge& edge)
421             {
422                 return !edge.term;
423             });
424
425             if (isLeaf) {
426                 generateInfixUnsuitableForReverseSuffixTree(nfa, stack, actions);
427                 generateSuffixWithReverseSuffixTree(nfa, stack, actions, reverseSuffixTreeRoots);
428
429                 // Only stop generating an NFA at a leaf to ensure we have a correct NFA. We could go slightly over the maxNFASize.
430                 if (nfa.nodes.size() > maxNFASize)
431                     nfaTooBig = true;
432             } else
433                 stack.removeLast();
434
435             if (!stack.isEmpty()) {
436                 auto& activeSubtree = stack.last();
437                 auto& edge = activeSubtree.vertex.edges[stack.last().edgeIndex];
438                 if (edge.child->edges.isEmpty())
439                     edge.term = nullptr; // Mark this leaf for deleting.
440                 activeSubtree.edgeIndex++;
441             }
442         }
443     }
444     clearReverseSuffixTree(reverseSuffixTreeRoots);
445 }
446
447 void CombinedURLFilters::processNFAs(size_t maxNFASize, const WTF::Function<void(NFA&&)>& handler)
448 {
449 #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
450     print();
451 #endif
452     while (true) {
453         // Traverse out to a leaf.
454         Vector<PrefixTreeVertex*, 128> stack;
455         PrefixTreeVertex* vertex = m_prefixTreeRoot.get();
456         while (true) {
457             ASSERT(vertex);
458             stack.append(vertex);
459             if (vertex->edges.isEmpty())
460                 break;
461             vertex = vertex->edges.last().child.get();
462         }
463         if (stack.size() == 1)
464             break; // We're done once we have processed and removed all the edges in the prefix tree.
465         
466         // Find the prefix root for this NFA. This is the vertex after the last term with a quantifier if there is one,
467         // or the root if there are no quantifiers left.
468         while (stack.size() > 1) {
469             if (!stack[stack.size() - 2]->edges.last().term->hasFixedLength())
470                 break;
471             stack.removeLast();
472         }
473         ASSERT_WITH_MESSAGE(!stack.isEmpty(), "At least the root should be in the stack");
474
475         // Make an NFA with the subtrees for whom this is also the last quantifier (or who also have no quantifier).
476         NFA nfa;
477         {
478             // Put the prefix into the NFA.
479             ImmutableCharNFANodeBuilder lastNode(nfa);
480             for (unsigned i = 0; i < stack.size() - 1; ++i) {
481                 const PrefixTreeEdge& edge = stack[i]->edges.last();
482                 ImmutableCharNFANodeBuilder newNode = edge.term->generateGraph(nfa, lastNode, m_actions.get(edge.child.get()));
483                 lastNode = WTFMove(newNode);
484             }
485
486             // Put the non-quantified vertices in the subtree into the NFA and delete them.
487             ASSERT(stack.last());
488             generateNFAForSubtree(nfa, WTFMove(lastNode), *stack.last(), m_actions, maxNFASize);
489         }
490         nfa.finalize();
491
492         handler(WTFMove(nfa));
493
494         // Clean up any processed leaf nodes.
495         while (true) {
496             if (stack.size() > 1) {
497                 if (stack[stack.size() - 1]->edges.isEmpty()) {
498                     stack[stack.size() - 2]->edges.removeLast();
499                     stack.removeLast();
500                 } else
501                     break; // Vertex is not a leaf.
502             } else
503                 break; // Leave the empty root.
504         }
505     }
506 }
507
508 } // namespace ContentExtensions
509 } // namespace WebCore
510
511 #endif // ENABLE(CONTENT_EXTENSIONS)