// Copyright 2009 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include "re2/prefilter_tree.h" #include #include #include #include #include #include #include "absl/log/absl_check.h" #include "absl/log/absl_log.h" #include "absl/strings/str_format.h" #include "re2/prefilter.h" namespace re2 { static const bool ExtraDebug = false; PrefilterTree::PrefilterTree() : compiled_(false), min_atom_len_(3) { } PrefilterTree::PrefilterTree(int min_atom_len) : compiled_(false), min_atom_len_(min_atom_len) { } PrefilterTree::~PrefilterTree() { for (size_t i = 0; i < prefilter_vec_.size(); i++) delete prefilter_vec_[i]; } void PrefilterTree::Add(Prefilter* prefilter) { if (compiled_) { ABSL_LOG(DFATAL) << "Add called after Compile."; return; } if (prefilter != NULL && !KeepNode(prefilter)) { delete prefilter; prefilter = NULL; } prefilter_vec_.push_back(prefilter); } void PrefilterTree::Compile(std::vector* atom_vec) { if (compiled_) { ABSL_LOG(DFATAL) << "Compile called already."; return; } // Some legacy users of PrefilterTree call Compile() before // adding any regexps and expect Compile() to have no effect. if (prefilter_vec_.empty()) { return; } compiled_ = true; NodeSet nodes; AssignUniqueIds(&nodes, atom_vec); if (ExtraDebug) PrintDebugInfo(&nodes); } Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) { NodeSet::const_iterator iter = nodes->find(node); if (iter != nodes->end()) { return *iter; } return NULL; } bool PrefilterTree::KeepNode(Prefilter* node) const { if (node == NULL) return false; switch (node->op()) { default: ABSL_LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op(); return false; case Prefilter::ALL: case Prefilter::NONE: return false; case Prefilter::ATOM: return node->atom().size() >= static_cast(min_atom_len_); case Prefilter::AND: { int j = 0; std::vector* subs = node->subs(); for (size_t i = 0; i < subs->size(); i++) if (KeepNode((*subs)[i])) (*subs)[j++] = (*subs)[i]; else delete (*subs)[i]; subs->resize(j); return j > 0; } case Prefilter::OR: for (size_t i = 0; i < node->subs()->size(); i++) if (!KeepNode((*node->subs())[i])) return false; return true; } } void PrefilterTree::AssignUniqueIds(NodeSet* nodes, std::vector* atom_vec) { atom_vec->clear(); // Build vector of all filter nodes, sorted topologically // from top to bottom in v. std::vector v; // Add the top level nodes of each regexp prefilter. for (size_t i = 0; i < prefilter_vec_.size(); i++) { Prefilter* f = prefilter_vec_[i]; if (f == NULL) unfiltered_.push_back(static_cast(i)); // We push NULL also on to v, so that we maintain the // mapping of index==regexpid for level=0 prefilter nodes. v.push_back(f); } // Now add all the descendant nodes. for (size_t i = 0; i < v.size(); i++) { Prefilter* f = v[i]; if (f == NULL) continue; if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { const std::vector& subs = *f->subs(); for (size_t j = 0; j < subs.size(); j++) v.push_back(subs[j]); } } // Identify unique nodes. int unique_id = 0; for (int i = static_cast(v.size()) - 1; i >= 0; i--) { Prefilter *node = v[i]; if (node == NULL) continue; node->set_unique_id(-1); Prefilter* canonical = CanonicalNode(nodes, node); if (canonical == NULL) { // Any further nodes that have the same atom/subs // will find this node as the canonical node. nodes->emplace(node); if (node->op() == Prefilter::ATOM) { atom_vec->push_back(node->atom()); atom_index_to_id_.push_back(unique_id); } node->set_unique_id(unique_id++); } else { node->set_unique_id(canonical->unique_id()); } } entries_.resize(unique_id); // Fill the entries. for (int i = static_cast(v.size()) - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; if (CanonicalNode(nodes, prefilter) != prefilter) continue; int id = prefilter->unique_id(); switch (prefilter->op()) { default: ABSL_LOG(DFATAL) << "Unexpected op: " << prefilter->op(); return; case Prefilter::ATOM: entries_[id].propagate_up_at_count = 1; break; case Prefilter::OR: case Prefilter::AND: { // For each child, we append our id to the child's list of // parent ids... unless we happen to have done so already. // The number of appends is the number of unique children, // which allows correct upward propagation from AND nodes. int up_count = 0; for (size_t j = 0; j < prefilter->subs()->size(); j++) { int child_id = (*prefilter->subs())[j]->unique_id(); std::vector& parents = entries_[child_id].parents; if (parents.empty() || parents.back() != id) { parents.push_back(id); up_count++; } } entries_[id].propagate_up_at_count = prefilter->op() == Prefilter::AND ? up_count : 1; break; } } } // For top level nodes, populate regexp id. for (size_t i = 0; i < prefilter_vec_.size(); i++) { if (prefilter_vec_[i] == NULL) continue; int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id(); ABSL_DCHECK_LE(0, id); Entry* entry = &entries_[id]; entry->regexps.push_back(static_cast(i)); } // Lastly, using probability-based heuristics, we identify nodes // that trigger too many parents and then we try to prune edges. // We use logarithms below to avoid the likelihood of underflow. double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size()); // Hoisted this above the loop so that we don't thrash the heap. std::vector> entries_by_num_edges; for (int i = static_cast(v.size()) - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; // Pruning applies only to AND nodes because it "just" reduces // precision; applied to OR nodes, it would break correctness. if (prefilter == NULL || prefilter->op() != Prefilter::AND) continue; if (CanonicalNode(nodes, prefilter) != prefilter) continue; int id = prefilter->unique_id(); // Sort the current node's children by the numbers of parents. entries_by_num_edges.clear(); for (size_t j = 0; j < prefilter->subs()->size(); j++) { int child_id = (*prefilter->subs())[j]->unique_id(); const std::vector& parents = entries_[child_id].parents; entries_by_num_edges.emplace_back(parents.size(), child_id); } std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end()); // A running estimate of how many regexps will be triggered by // pruning the remaining children's edges to the current node. // Our nominal target is one, so the threshold is log(1) == 0; // pruning occurs iff the child has more than nine edges left. double log_num_triggered = log_num_regexps; for (const auto& pair : entries_by_num_edges) { int child_id = pair.second; std::vector& parents = entries_[child_id].parents; if (log_num_triggered > 0.) { log_num_triggered += std::log(parents.size()); log_num_triggered -= log_num_regexps; } else if (parents.size() > 9) { auto it = std::find(parents.begin(), parents.end(), id); if (it != parents.end()) { parents.erase(it); entries_[id].propagate_up_at_count--; } } } } } // Functions for triggering during search. void PrefilterTree::RegexpsGivenStrings( const std::vector& matched_atoms, std::vector* regexps) const { regexps->clear(); if (!compiled_) { // Some legacy users of PrefilterTree call Compile() before // adding any regexps and expect Compile() to have no effect. // This kludge is a counterpart to that kludge. if (prefilter_vec_.empty()) { return; } ABSL_LOG(ERROR) << "RegexpsGivenStrings called before Compile."; for (size_t i = 0; i < prefilter_vec_.size(); i++) regexps->push_back(static_cast(i)); } else { IntMap regexps_map(static_cast(prefilter_vec_.size())); std::vector matched_atom_ids; for (size_t j = 0; j < matched_atoms.size(); j++) matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); PropagateMatch(matched_atom_ids, ®exps_map); for (IntMap::const_iterator it = regexps_map.begin(); it != regexps_map.end(); ++it) regexps->push_back(it->index()); regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); } std::sort(regexps->begin(), regexps->end()); } void PrefilterTree::PropagateMatch(const std::vector& atom_ids, IntMap* regexps) const { IntMap count(static_cast(entries_.size())); IntMap work(static_cast(entries_.size())); for (size_t i = 0; i < atom_ids.size(); i++) work.set(atom_ids[i], 1); for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) { const Entry& entry = entries_[it->index()]; // Record regexps triggered. for (size_t i = 0; i < entry.regexps.size(); i++) regexps->set(entry.regexps[i], 1); int c; // Pass trigger up to parents. for (int j : entry.parents) { const Entry& parent = entries_[j]; // Delay until all the children have succeeded. if (parent.propagate_up_at_count > 1) { if (count.has_index(j)) { c = count.get_existing(j) + 1; count.set_existing(j, c); } else { c = 1; count.set_new(j, c); } if (c < parent.propagate_up_at_count) continue; } // Trigger the parent. work.set(j, 1); } } } // Debugging help. void PrefilterTree::PrintPrefilter(int regexpid) { ABSL_LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]); } void PrefilterTree::PrintDebugInfo(NodeSet* nodes) { ABSL_LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size(); ABSL_LOG(ERROR) << "#Unique Nodes: " << entries_.size(); for (size_t i = 0; i < entries_.size(); i++) { const std::vector& parents = entries_[i].parents; const std::vector& regexps = entries_[i].regexps; ABSL_LOG(ERROR) << "EntryId: " << i << " N: " << parents.size() << " R: " << regexps.size(); for (int parent : parents) ABSL_LOG(ERROR) << parent; } ABSL_LOG(ERROR) << "Set:"; for (NodeSet::const_iterator iter = nodes->begin(); iter != nodes->end(); ++iter) ABSL_LOG(ERROR) << "NodeId: " << (*iter)->unique_id(); } std::string PrefilterTree::DebugNodeString(Prefilter* node) const { std::string node_string = ""; if (node->op() == Prefilter::ATOM) { ABSL_DCHECK(!node->atom().empty()); node_string += node->atom(); } else { // Adding the operation disambiguates AND and OR nodes. node_string += node->op() == Prefilter::AND ? "AND" : "OR"; node_string += "("; for (size_t i = 0; i < node->subs()->size(); i++) { if (i > 0) node_string += ','; node_string += absl::StrFormat("%d", (*node->subs())[i]->unique_id()); node_string += ":"; node_string += DebugNodeString((*node->subs())[i]); } node_string += ")"; } return node_string; } } // namespace re2