bitstate.cc 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. // Copyright 2008 The RE2 Authors. All Rights Reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Tested by search_test.cc, exhaustive_test.cc, tester.cc
  5. // Prog::SearchBitState is a regular expression search with submatch
  6. // tracking for small regular expressions and texts. Similarly to
  7. // testing/backtrack.cc, it allocates a bitmap with (count of
  8. // lists) * (length of text) bits to make sure it never explores the
  9. // same (instruction list, character position) multiple times. This
  10. // limits the search to run in time linear in the length of the text.
  11. //
  12. // Unlike testing/backtrack.cc, SearchBitState is not recursive
  13. // on the text.
  14. //
  15. // SearchBitState is a fast replacement for the NFA code on small
  16. // regexps and texts when SearchOnePass cannot be used.
  17. #include <stddef.h>
  18. #include <stdint.h>
  19. #include <string.h>
  20. #include <limits>
  21. #include <utility>
  22. #include "util/logging.h"
  23. #include "re2/pod_array.h"
  24. #include "re2/prog.h"
  25. #include "re2/regexp.h"
  26. namespace re2 {
  27. struct Job {
  28. int id;
  29. int rle; // run length encoding
  30. const char* p;
  31. };
  32. class BitState {
  33. public:
  34. explicit BitState(Prog* prog);
  35. // The usual Search prototype.
  36. // Can only call Search once per BitState.
  37. bool Search(const StringPiece& text, const StringPiece& context,
  38. bool anchored, bool longest,
  39. StringPiece* submatch, int nsubmatch);
  40. private:
  41. inline bool ShouldVisit(int id, const char* p);
  42. void Push(int id, const char* p);
  43. void GrowStack();
  44. bool TrySearch(int id, const char* p);
  45. // Search parameters
  46. Prog* prog_; // program being run
  47. StringPiece text_; // text being searched
  48. StringPiece context_; // greater context of text being searched
  49. bool anchored_; // whether search is anchored at text.begin()
  50. bool longest_; // whether search wants leftmost-longest match
  51. bool endmatch_; // whether match must end at text.end()
  52. StringPiece* submatch_; // submatches to fill in
  53. int nsubmatch_; // # of submatches to fill in
  54. // Search state
  55. static constexpr int kVisitedBits = 64;
  56. PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited
  57. PODArray<const char*> cap_; // capture registers
  58. PODArray<Job> job_; // stack of text positions to explore
  59. int njob_; // stack size
  60. BitState(const BitState&) = delete;
  61. BitState& operator=(const BitState&) = delete;
  62. };
  63. BitState::BitState(Prog* prog)
  64. : prog_(prog),
  65. anchored_(false),
  66. longest_(false),
  67. endmatch_(false),
  68. submatch_(NULL),
  69. nsubmatch_(0),
  70. njob_(0) {
  71. }
  72. // Given id, which *must* be a list head, we can look up its list ID.
  73. // Then the question is: Should the search visit the (list ID, p) pair?
  74. // If so, remember that it was visited so that the next time,
  75. // we don't repeat the visit.
  76. bool BitState::ShouldVisit(int id, const char* p) {
  77. int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
  78. static_cast<int>(p-text_.data());
  79. if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
  80. return false;
  81. visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
  82. return true;
  83. }
  84. // Grow the stack.
  85. void BitState::GrowStack() {
  86. PODArray<Job> tmp(2*job_.size());
  87. memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
  88. job_ = std::move(tmp);
  89. }
  90. // Push (id, p) onto the stack, growing it if necessary.
  91. void BitState::Push(int id, const char* p) {
  92. if (njob_ >= job_.size()) {
  93. GrowStack();
  94. if (njob_ >= job_.size()) {
  95. LOG(DFATAL) << "GrowStack() failed: "
  96. << "njob_ = " << njob_ << ", "
  97. << "job_.size() = " << job_.size();
  98. return;
  99. }
  100. }
  101. // If id < 0, it's undoing a Capture,
  102. // so we mustn't interfere with that.
  103. if (id >= 0 && njob_ > 0) {
  104. Job* top = &job_[njob_-1];
  105. if (id == top->id &&
  106. p == top->p + top->rle + 1 &&
  107. top->rle < std::numeric_limits<int>::max()) {
  108. ++top->rle;
  109. return;
  110. }
  111. }
  112. Job* top = &job_[njob_++];
  113. top->id = id;
  114. top->rle = 0;
  115. top->p = p;
  116. }
  117. // Try a search from instruction id0 in state p0.
  118. // Return whether it succeeded.
  119. bool BitState::TrySearch(int id0, const char* p0) {
  120. bool matched = false;
  121. const char* end = text_.data() + text_.size();
  122. njob_ = 0;
  123. // Push() no longer checks ShouldVisit(),
  124. // so we must perform the check ourselves.
  125. if (ShouldVisit(id0, p0))
  126. Push(id0, p0);
  127. while (njob_ > 0) {
  128. // Pop job off stack.
  129. --njob_;
  130. int id = job_[njob_].id;
  131. int& rle = job_[njob_].rle;
  132. const char* p = job_[njob_].p;
  133. if (id < 0) {
  134. // Undo the Capture.
  135. cap_[prog_->inst(-id)->cap()] = p;
  136. continue;
  137. }
  138. if (rle > 0) {
  139. p += rle;
  140. // Revivify job on stack.
  141. --rle;
  142. ++njob_;
  143. }
  144. Loop:
  145. // Visit id, p.
  146. Prog::Inst* ip = prog_->inst(id);
  147. switch (ip->opcode()) {
  148. default:
  149. LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
  150. return false;
  151. case kInstFail:
  152. break;
  153. case kInstAltMatch:
  154. if (ip->greedy(prog_)) {
  155. // out1 is the Match instruction.
  156. id = ip->out1();
  157. p = end;
  158. goto Loop;
  159. }
  160. if (longest_) {
  161. // ip must be non-greedy...
  162. // out is the Match instruction.
  163. id = ip->out();
  164. p = end;
  165. goto Loop;
  166. }
  167. goto Next;
  168. case kInstByteRange: {
  169. int c = -1;
  170. if (p < end)
  171. c = *p & 0xFF;
  172. if (!ip->Matches(c))
  173. goto Next;
  174. if (ip->hint() != 0)
  175. Push(id+ip->hint(), p); // try the next when we're done
  176. id = ip->out();
  177. p++;
  178. goto CheckAndLoop;
  179. }
  180. case kInstCapture:
  181. if (!ip->last())
  182. Push(id+1, p); // try the next when we're done
  183. if (0 <= ip->cap() && ip->cap() < cap_.size()) {
  184. // Capture p to register, but save old value first.
  185. Push(-id, cap_[ip->cap()]); // undo when we're done
  186. cap_[ip->cap()] = p;
  187. }
  188. id = ip->out();
  189. goto CheckAndLoop;
  190. case kInstEmptyWidth:
  191. if (ip->empty() & ~Prog::EmptyFlags(context_, p))
  192. goto Next;
  193. if (!ip->last())
  194. Push(id+1, p); // try the next when we're done
  195. id = ip->out();
  196. goto CheckAndLoop;
  197. case kInstNop:
  198. if (!ip->last())
  199. Push(id+1, p); // try the next when we're done
  200. id = ip->out();
  201. CheckAndLoop:
  202. // Sanity check: id is the head of its list, which must
  203. // be the case if id-1 is the last of *its* list. :)
  204. DCHECK(id == 0 || prog_->inst(id-1)->last());
  205. if (ShouldVisit(id, p))
  206. goto Loop;
  207. break;
  208. case kInstMatch: {
  209. if (endmatch_ && p != end)
  210. goto Next;
  211. // We found a match. If the caller doesn't care
  212. // where the match is, no point going further.
  213. if (nsubmatch_ == 0)
  214. return true;
  215. // Record best match so far.
  216. // Only need to check end point, because this entire
  217. // call is only considering one start position.
  218. matched = true;
  219. cap_[1] = p;
  220. if (submatch_[0].data() == NULL ||
  221. (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
  222. for (int i = 0; i < nsubmatch_; i++)
  223. submatch_[i] =
  224. StringPiece(cap_[2 * i],
  225. static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
  226. }
  227. // If going for first match, we're done.
  228. if (!longest_)
  229. return true;
  230. // If we used the entire text, no longer match is possible.
  231. if (p == end)
  232. return true;
  233. // Otherwise, continue on in hope of a longer match.
  234. // Note the absence of the ShouldVisit() check here
  235. // due to execution remaining in the same list.
  236. Next:
  237. if (!ip->last()) {
  238. id++;
  239. goto Loop;
  240. }
  241. break;
  242. }
  243. }
  244. }
  245. return matched;
  246. }
  247. // Search text (within context) for prog_.
  248. bool BitState::Search(const StringPiece& text, const StringPiece& context,
  249. bool anchored, bool longest,
  250. StringPiece* submatch, int nsubmatch) {
  251. // Search parameters.
  252. text_ = text;
  253. context_ = context;
  254. if (context_.data() == NULL)
  255. context_ = text;
  256. if (prog_->anchor_start() && context_.begin() != text.begin())
  257. return false;
  258. if (prog_->anchor_end() && context_.end() != text.end())
  259. return false;
  260. anchored_ = anchored || prog_->anchor_start();
  261. longest_ = longest || prog_->anchor_end();
  262. endmatch_ = prog_->anchor_end();
  263. submatch_ = submatch;
  264. nsubmatch_ = nsubmatch;
  265. for (int i = 0; i < nsubmatch_; i++)
  266. submatch_[i] = StringPiece();
  267. // Allocate scratch space.
  268. int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
  269. nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
  270. visited_ = PODArray<uint64_t>(nvisited);
  271. memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
  272. int ncap = 2*nsubmatch;
  273. if (ncap < 2)
  274. ncap = 2;
  275. cap_ = PODArray<const char*>(ncap);
  276. memset(cap_.data(), 0, ncap*sizeof cap_[0]);
  277. // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
  278. job_ = PODArray<Job>(64);
  279. // Anchored search must start at text.begin().
  280. if (anchored_) {
  281. cap_[0] = text.data();
  282. return TrySearch(prog_->start(), text.data());
  283. }
  284. // Unanchored search, starting from each possible text position.
  285. // Notice that we have to try the empty string at the end of
  286. // the text, so the loop condition is p <= text.end(), not p < text.end().
  287. // This looks like it's quadratic in the size of the text,
  288. // but we are not clearing visited_ between calls to TrySearch,
  289. // so no work is duplicated and it ends up still being linear.
  290. const char* etext = text.data() + text.size();
  291. for (const char* p = text.data(); p <= etext; p++) {
  292. // Try to use prefix accel (e.g. memchr) to skip ahead.
  293. if (p < etext && prog_->can_prefix_accel()) {
  294. p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
  295. if (p == NULL)
  296. p = etext;
  297. }
  298. cap_[0] = p;
  299. if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
  300. return true;
  301. // Avoid invoking undefined behavior (arithmetic on a null pointer)
  302. // by simply not continuing the loop.
  303. if (p == NULL)
  304. break;
  305. }
  306. return false;
  307. }
  308. // Bit-state search.
  309. bool Prog::SearchBitState(const StringPiece& text,
  310. const StringPiece& context,
  311. Anchor anchor,
  312. MatchKind kind,
  313. StringPiece* match,
  314. int nmatch) {
  315. // If full match, we ask for an anchored longest match
  316. // and then check that match[0] == text.
  317. // So make sure match[0] exists.
  318. StringPiece sp0;
  319. if (kind == kFullMatch) {
  320. anchor = kAnchored;
  321. if (nmatch < 1) {
  322. match = &sp0;
  323. nmatch = 1;
  324. }
  325. }
  326. // Run the search.
  327. BitState b(this);
  328. bool anchored = anchor == kAnchored;
  329. bool longest = kind != kFirstMatch;
  330. if (!b.Search(text, context, anchored, longest, match, nmatch))
  331. return false;
  332. if (kind == kFullMatch && match[0].end() != text.end())
  333. return false;
  334. return true;
  335. }
  336. } // namespace re2