dfa.cc 72 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118
  1. // Copyright 2008 The RE2 Authors. All Rights Reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // A DFA (deterministic finite automaton)-based regular expression search.
  5. //
  6. // The DFA search has two main parts: the construction of the automaton,
  7. // which is represented by a graph of State structures, and the execution
  8. // of the automaton over a given input string.
  9. //
  10. // The basic idea is that the State graph is constructed so that the
  11. // execution can simply start with a state s, and then for each byte c in
  12. // the input string, execute "s = s->next[c]", checking at each point whether
  13. // the current s represents a matching state.
  14. //
  15. // The simple explanation just given does convey the essence of this code,
  16. // but it omits the details of how the State graph gets constructed as well
  17. // as some performance-driven optimizations to the execution of the automaton.
  18. // All these details are explained in the comments for the code following
  19. // the definition of class DFA.
  20. //
  21. // See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.
  22. #include <stddef.h>
  23. #include <stdint.h>
  24. #include <stdio.h>
  25. #include <string.h>
  26. #include <algorithm>
  27. #include <atomic>
  28. #include <deque>
  29. #include <mutex>
  30. #include <new>
  31. #include <string>
  32. #include <unordered_map>
  33. #include <unordered_set>
  34. #include <utility>
  35. #include <vector>
  36. #include "util/logging.h"
  37. #include "util/mix.h"
  38. #include "util/mutex.h"
  39. #include "util/strutil.h"
  40. #include "re2/pod_array.h"
  41. #include "re2/prog.h"
  42. #include "re2/re2.h"
  43. #include "re2/sparse_set.h"
  44. #include "re2/stringpiece.h"
  45. // Silence "zero-sized array in struct/union" warning for DFA::State::next_.
  46. #ifdef _MSC_VER
  47. #pragma warning(disable: 4200)
  48. #endif
  49. namespace re2 {
  50. // Controls whether the DFA should bail out early if the NFA would be faster.
  51. static bool dfa_should_bail_when_slow = true;
  52. void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) {
  53. dfa_should_bail_when_slow = b;
  54. }
  55. // Changing this to true compiles in prints that trace execution of the DFA.
  56. // Generates a lot of output -- only useful for debugging.
  57. static const bool ExtraDebug = false;
  58. // A DFA implementation of a regular expression program.
  59. // Since this is entirely a forward declaration mandated by C++,
  60. // some of the comments here are better understood after reading
  61. // the comments in the sections that follow the DFA definition.
  62. class DFA {
  63. public:
  64. DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem);
  65. ~DFA();
  66. bool ok() const { return !init_failed_; }
  67. Prog::MatchKind kind() { return kind_; }
  68. // Searches for the regular expression in text, which is considered
  69. // as a subsection of context for the purposes of interpreting flags
  70. // like ^ and $ and \A and \z.
  71. // Returns whether a match was found.
  72. // If a match is found, sets *ep to the end point of the best match in text.
  73. // If "anchored", the match must begin at the start of text.
  74. // If "want_earliest_match", the match that ends first is used, not
  75. // necessarily the best one.
  76. // If "run_forward" is true, the DFA runs from text.begin() to text.end().
  77. // If it is false, the DFA runs from text.end() to text.begin(),
  78. // returning the leftmost end of the match instead of the rightmost one.
  79. // If the DFA cannot complete the search (for example, if it is out of
  80. // memory), it sets *failed and returns false.
  81. bool Search(const StringPiece& text, const StringPiece& context,
  82. bool anchored, bool want_earliest_match, bool run_forward,
  83. bool* failed, const char** ep, SparseSet* matches);
  84. // Builds out all states for the entire DFA.
  85. // If cb is not empty, it receives one callback per state built.
  86. // Returns the number of states built.
  87. // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
  88. int BuildAllStates(const Prog::DFAStateCallback& cb);
  89. // Computes min and max for matching strings. Won't return strings
  90. // bigger than maxlen.
  91. bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
  92. // These data structures are logically private, but C++ makes it too
  93. // difficult to mark them as such.
  94. class RWLocker;
  95. class StateSaver;
  96. class Workq;
  97. // A single DFA state. The DFA is represented as a graph of these
  98. // States, linked by the next_ pointers. If in state s and reading
  99. // byte c, the next state should be s->next_[c].
  100. struct State {
  101. inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; }
  102. int* inst_; // Instruction pointers in the state.
  103. int ninst_; // # of inst_ pointers.
  104. uint32_t flag_; // Empty string bitfield flags in effect on the way
  105. // into this state, along with kFlagMatch if this
  106. // is a matching state.
  107. // Work around the bug affecting flexible array members in GCC 6.x (for x >= 1).
  108. // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932)
  109. #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1
  110. std::atomic<State*> next_[0]; // Outgoing arrows from State,
  111. #else
  112. std::atomic<State*> next_[]; // Outgoing arrows from State,
  113. #endif
  114. // one per input byte class
  115. };
  116. enum {
  117. kByteEndText = 256, // imaginary byte at end of text
  118. kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags
  119. kFlagMatch = 0x0100, // State.flag_: this is a matching state
  120. kFlagLastWord = 0x0200, // State.flag_: last byte was a word char
  121. kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left
  122. };
  123. struct StateHash {
  124. size_t operator()(const State* a) const {
  125. DCHECK(a != NULL);
  126. HashMix mix(a->flag_);
  127. for (int i = 0; i < a->ninst_; i++)
  128. mix.Mix(a->inst_[i]);
  129. mix.Mix(0);
  130. return mix.get();
  131. }
  132. };
  133. struct StateEqual {
  134. bool operator()(const State* a, const State* b) const {
  135. DCHECK(a != NULL);
  136. DCHECK(b != NULL);
  137. if (a == b)
  138. return true;
  139. if (a->flag_ != b->flag_)
  140. return false;
  141. if (a->ninst_ != b->ninst_)
  142. return false;
  143. for (int i = 0; i < a->ninst_; i++)
  144. if (a->inst_[i] != b->inst_[i])
  145. return false;
  146. return true;
  147. }
  148. };
  149. typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
  150. private:
  151. // Make it easier to swap in a scalable reader-writer mutex.
  152. using CacheMutex = Mutex;
  153. enum {
  154. // Indices into start_ for unanchored searches.
  155. // Add kStartAnchored for anchored searches.
  156. kStartBeginText = 0, // text at beginning of context
  157. kStartBeginLine = 2, // text at beginning of line
  158. kStartAfterWordChar = 4, // text follows a word character
  159. kStartAfterNonWordChar = 6, // text follows non-word character
  160. kMaxStart = 8,
  161. kStartAnchored = 1,
  162. };
  163. // Resets the DFA State cache, flushing all saved State* information.
  164. // Releases and reacquires cache_mutex_ via cache_lock, so any
  165. // State* existing before the call are not valid after the call.
  166. // Use a StateSaver to preserve important states across the call.
  167. // cache_mutex_.r <= L < mutex_
  168. // After: cache_mutex_.w <= L < mutex_
  169. void ResetCache(RWLocker* cache_lock);
  170. // Looks up and returns the State corresponding to a Workq.
  171. // L >= mutex_
  172. State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag);
  173. // Looks up and returns a State matching the inst, ninst, and flag.
  174. // L >= mutex_
  175. State* CachedState(int* inst, int ninst, uint32_t flag);
  176. // Clear the cache entirely.
  177. // Must hold cache_mutex_.w or be in destructor.
  178. void ClearCache();
  179. // Converts a State into a Workq: the opposite of WorkqToCachedState.
  180. // L >= mutex_
  181. void StateToWorkq(State* s, Workq* q);
  182. // Runs a State on a given byte, returning the next state.
  183. State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_
  184. State* RunStateOnByte(State*, int); // L >= mutex_
  185. // Runs a Workq on a given byte followed by a set of empty-string flags,
  186. // producing a new Workq in nq. If a match instruction is encountered,
  187. // sets *ismatch to true.
  188. // L >= mutex_
  189. void RunWorkqOnByte(Workq* q, Workq* nq,
  190. int c, uint32_t flag, bool* ismatch);
  191. // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
  192. // L >= mutex_
  193. void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint32_t flag);
  194. // Adds the instruction id to the Workq, following empty arrows
  195. // according to flag.
  196. // L >= mutex_
  197. void AddToQueue(Workq* q, int id, uint32_t flag);
  198. // For debugging, returns a text representation of State.
  199. static std::string DumpState(State* state);
  200. // For debugging, returns a text representation of a Workq.
  201. static std::string DumpWorkq(Workq* q);
  202. // Search parameters
  203. struct SearchParams {
  204. SearchParams(const StringPiece& text, const StringPiece& context,
  205. RWLocker* cache_lock)
  206. : text(text),
  207. context(context),
  208. anchored(false),
  209. can_prefix_accel(false),
  210. want_earliest_match(false),
  211. run_forward(false),
  212. start(NULL),
  213. cache_lock(cache_lock),
  214. failed(false),
  215. ep(NULL),
  216. matches(NULL) {}
  217. StringPiece text;
  218. StringPiece context;
  219. bool anchored;
  220. bool can_prefix_accel;
  221. bool want_earliest_match;
  222. bool run_forward;
  223. State* start;
  224. RWLocker* cache_lock;
  225. bool failed; // "out" parameter: whether search gave up
  226. const char* ep; // "out" parameter: end pointer for match
  227. SparseSet* matches;
  228. private:
  229. SearchParams(const SearchParams&) = delete;
  230. SearchParams& operator=(const SearchParams&) = delete;
  231. };
  232. // Before each search, the parameters to Search are analyzed by
  233. // AnalyzeSearch to determine the state in which to start.
  234. struct StartInfo {
  235. StartInfo() : start(NULL) {}
  236. std::atomic<State*> start;
  237. };
  238. // Fills in params->start and params->can_prefix_accel using
  239. // the other search parameters. Returns true on success,
  240. // false on failure.
  241. // cache_mutex_.r <= L < mutex_
  242. bool AnalyzeSearch(SearchParams* params);
  243. bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
  244. uint32_t flags);
  245. // The generic search loop, inlined to create specialized versions.
  246. // cache_mutex_.r <= L < mutex_
  247. // Might unlock and relock cache_mutex_ via params->cache_lock.
  248. template <bool can_prefix_accel,
  249. bool want_earliest_match,
  250. bool run_forward>
  251. inline bool InlinedSearchLoop(SearchParams* params);
  252. // The specialized versions of InlinedSearchLoop. The three letters
  253. // at the ends of the name denote the true/false values used as the
  254. // last three parameters of InlinedSearchLoop.
  255. // cache_mutex_.r <= L < mutex_
  256. // Might unlock and relock cache_mutex_ via params->cache_lock.
  257. bool SearchFFF(SearchParams* params);
  258. bool SearchFFT(SearchParams* params);
  259. bool SearchFTF(SearchParams* params);
  260. bool SearchFTT(SearchParams* params);
  261. bool SearchTFF(SearchParams* params);
  262. bool SearchTFT(SearchParams* params);
  263. bool SearchTTF(SearchParams* params);
  264. bool SearchTTT(SearchParams* params);
  265. // The main search loop: calls an appropriate specialized version of
  266. // InlinedSearchLoop.
  267. // cache_mutex_.r <= L < mutex_
  268. // Might unlock and relock cache_mutex_ via params->cache_lock.
  269. bool FastSearchLoop(SearchParams* params);
  270. // Looks up bytes in bytemap_ but handles case c == kByteEndText too.
  271. int ByteMap(int c) {
  272. if (c == kByteEndText)
  273. return prog_->bytemap_range();
  274. return prog_->bytemap()[c];
  275. }
  276. // Constant after initialization.
  277. Prog* prog_; // The regular expression program to run.
  278. Prog::MatchKind kind_; // The kind of DFA.
  279. bool init_failed_; // initialization failed (out of memory)
  280. Mutex mutex_; // mutex_ >= cache_mutex_.r
  281. // Scratch areas, protected by mutex_.
  282. Workq* q0_; // Two pre-allocated work queues.
  283. Workq* q1_;
  284. PODArray<int> stack_; // Pre-allocated stack for AddToQueue
  285. // State* cache. Many threads use and add to the cache simultaneously,
  286. // holding cache_mutex_ for reading and mutex_ (above) when adding.
  287. // If the cache fills and needs to be discarded, the discarding is done
  288. // while holding cache_mutex_ for writing, to avoid interrupting other
  289. // readers. Any State* pointers are only valid while cache_mutex_
  290. // is held.
  291. CacheMutex cache_mutex_;
  292. int64_t mem_budget_; // Total memory budget for all States.
  293. int64_t state_budget_; // Amount of memory remaining for new States.
  294. StateSet state_cache_; // All States computed so far.
  295. StartInfo start_[kMaxStart];
  296. DFA(const DFA&) = delete;
  297. DFA& operator=(const DFA&) = delete;
  298. };
  299. // Shorthand for casting to uint8_t*.
  300. static inline const uint8_t* BytePtr(const void* v) {
  301. return reinterpret_cast<const uint8_t*>(v);
  302. }
  303. // Work queues
  304. // Marks separate thread groups of different priority
  305. // in the work queue when in leftmost-longest matching mode.
  306. #define Mark (-1)
  307. // Separates the match IDs from the instructions in inst_.
  308. // Used only for "many match" DFA states.
  309. #define MatchSep (-2)
  310. // Internally, the DFA uses a sparse array of
  311. // program instruction pointers as a work queue.
  312. // In leftmost longest mode, marks separate sections
  313. // of workq that started executing at different
  314. // locations in the string (earlier locations first).
  315. class DFA::Workq : public SparseSet {
  316. public:
  317. // Constructor: n is number of normal slots, maxmark number of mark slots.
  318. Workq(int n, int maxmark) :
  319. SparseSet(n+maxmark),
  320. n_(n),
  321. maxmark_(maxmark),
  322. nextmark_(n),
  323. last_was_mark_(true) {
  324. }
  325. bool is_mark(int i) { return i >= n_; }
  326. int maxmark() { return maxmark_; }
  327. void clear() {
  328. SparseSet::clear();
  329. nextmark_ = n_;
  330. }
  331. void mark() {
  332. if (last_was_mark_)
  333. return;
  334. last_was_mark_ = false;
  335. SparseSet::insert_new(nextmark_++);
  336. }
  337. int size() {
  338. return n_ + maxmark_;
  339. }
  340. void insert(int id) {
  341. if (contains(id))
  342. return;
  343. insert_new(id);
  344. }
  345. void insert_new(int id) {
  346. last_was_mark_ = false;
  347. SparseSet::insert_new(id);
  348. }
  349. private:
  350. int n_; // size excluding marks
  351. int maxmark_; // maximum number of marks
  352. int nextmark_; // id of next mark
  353. bool last_was_mark_; // last inserted was mark
  354. Workq(const Workq&) = delete;
  355. Workq& operator=(const Workq&) = delete;
  356. };
  357. DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem)
  358. : prog_(prog),
  359. kind_(kind),
  360. init_failed_(false),
  361. q0_(NULL),
  362. q1_(NULL),
  363. mem_budget_(max_mem) {
  364. if (ExtraDebug)
  365. fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str());
  366. int nmark = 0;
  367. if (kind_ == Prog::kLongestMatch)
  368. nmark = prog_->size();
  369. // See DFA::AddToQueue() for why this is so.
  370. int nstack = prog_->inst_count(kInstCapture) +
  371. prog_->inst_count(kInstEmptyWidth) +
  372. prog_->inst_count(kInstNop) +
  373. nmark + 1; // + 1 for start inst
  374. // Account for space needed for DFA, q0, q1, stack.
  375. mem_budget_ -= sizeof(DFA);
  376. mem_budget_ -= (prog_->size() + nmark) *
  377. (sizeof(int)+sizeof(int)) * 2; // q0, q1
  378. mem_budget_ -= nstack * sizeof(int); // stack
  379. if (mem_budget_ < 0) {
  380. init_failed_ = true;
  381. return;
  382. }
  383. state_budget_ = mem_budget_;
  384. // Make sure there is a reasonable amount of working room left.
  385. // At minimum, the search requires room for two states in order
  386. // to limp along, restarting frequently. We'll get better performance
  387. // if there is room for a larger number of states, say 20.
  388. // Note that a state stores list heads only, so we use the program
  389. // list count for the upper bound, not the program size.
  390. int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
  391. int64_t one_state = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
  392. (prog_->list_count()+nmark)*sizeof(int);
  393. if (state_budget_ < 20*one_state) {
  394. init_failed_ = true;
  395. return;
  396. }
  397. q0_ = new Workq(prog_->size(), nmark);
  398. q1_ = new Workq(prog_->size(), nmark);
  399. stack_ = PODArray<int>(nstack);
  400. }
  401. DFA::~DFA() {
  402. delete q0_;
  403. delete q1_;
  404. ClearCache();
  405. }
  406. // In the DFA state graph, s->next[c] == NULL means that the
  407. // state has not yet been computed and needs to be. We need
  408. // a different special value to signal that s->next[c] is a
  409. // state that can never lead to a match (and thus the search
  410. // can be called off). Hence DeadState.
  411. #define DeadState reinterpret_cast<State*>(1)
  412. // Signals that the rest of the string matches no matter what it is.
  413. #define FullMatchState reinterpret_cast<State*>(2)
  414. #define SpecialStateMax FullMatchState
  415. // Debugging printouts
  416. // For debugging, returns a string representation of the work queue.
  417. std::string DFA::DumpWorkq(Workq* q) {
  418. std::string s;
  419. const char* sep = "";
  420. for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
  421. if (q->is_mark(*it)) {
  422. s += "|";
  423. sep = "";
  424. } else {
  425. s += StringPrintf("%s%d", sep, *it);
  426. sep = ",";
  427. }
  428. }
  429. return s;
  430. }
  431. // For debugging, returns a string representation of the state.
  432. std::string DFA::DumpState(State* state) {
  433. if (state == NULL)
  434. return "_";
  435. if (state == DeadState)
  436. return "X";
  437. if (state == FullMatchState)
  438. return "*";
  439. std::string s;
  440. const char* sep = "";
  441. s += StringPrintf("(%p)", state);
  442. for (int i = 0; i < state->ninst_; i++) {
  443. if (state->inst_[i] == Mark) {
  444. s += "|";
  445. sep = "";
  446. } else if (state->inst_[i] == MatchSep) {
  447. s += "||";
  448. sep = "";
  449. } else {
  450. s += StringPrintf("%s%d", sep, state->inst_[i]);
  451. sep = ",";
  452. }
  453. }
  454. s += StringPrintf(" flag=%#x", state->flag_);
  455. return s;
  456. }
  457. //////////////////////////////////////////////////////////////////////
  458. //
  459. // DFA state graph construction.
  460. //
  461. // The DFA state graph is a heavily-linked collection of State* structures.
  462. // The state_cache_ is a set of all the State structures ever allocated,
  463. // so that if the same state is reached by two different paths,
  464. // the same State structure can be used. This reduces allocation
  465. // requirements and also avoids duplication of effort across the two
  466. // identical states.
  467. //
  468. // A State is defined by an ordered list of instruction ids and a flag word.
  469. //
  470. // The choice of an ordered list of instructions differs from a typical
  471. // textbook DFA implementation, which would use an unordered set.
  472. // Textbook descriptions, however, only care about whether
  473. // the DFA matches, not where it matches in the text. To decide where the
  474. // DFA matches, we need to mimic the behavior of the dominant backtracking
  475. // implementations like PCRE, which try one possible regular expression
  476. // execution, then another, then another, stopping when one of them succeeds.
  477. // The DFA execution tries these many executions in parallel, representing
  478. // each by an instruction id. These pointers are ordered in the State.inst_
  479. // list in the same order that the executions would happen in a backtracking
  480. // search: if a match is found during execution of inst_[2], inst_[i] for i>=3
  481. // can be discarded.
  482. //
  483. // Textbooks also typically do not consider context-aware empty string operators
  484. // like ^ or $. These are handled by the flag word, which specifies the set
  485. // of empty-string operators that should be matched when executing at the
  486. // current text position. These flag bits are defined in prog.h.
  487. // The flag word also contains two DFA-specific bits: kFlagMatch if the state
  488. // is a matching state (one that reached a kInstMatch in the program)
  489. // and kFlagLastWord if the last processed byte was a word character, for the
  490. // implementation of \B and \b.
  491. //
  492. // The flag word also contains, shifted up 16 bits, the bits looked for by
  493. // any kInstEmptyWidth instructions in the state. These provide a useful
  494. // summary indicating when new flags might be useful.
  495. //
  496. // The permanent representation of a State's instruction ids is just an array,
  497. // but while a state is being analyzed, these instruction ids are represented
  498. // as a Workq, which is an array that allows iteration in insertion order.
  499. // NOTE(rsc): The choice of State construction determines whether the DFA
  500. // mimics backtracking implementations (so-called leftmost first matching) or
  501. // traditional DFA implementations (so-called leftmost longest matching as
  502. // prescribed by POSIX). This implementation chooses to mimic the
  503. // backtracking implementations, because we want to replace PCRE. To get
  504. // POSIX behavior, the states would need to be considered not as a simple
  505. // ordered list of instruction ids, but as a list of unordered sets of instruction
  506. // ids. A match by a state in one set would inhibit the running of sets
  507. // farther down the list but not other instruction ids in the same set. Each
  508. // set would correspond to matches beginning at a given point in the string.
  509. // This is implemented by separating different sets with Mark pointers.
  510. // Looks in the State cache for a State matching q, flag.
  511. // If one is found, returns it. If one is not found, allocates one,
  512. // inserts it in the cache, and returns it.
  513. // If mq is not null, MatchSep and the match IDs in mq will be appended
  514. // to the State.
  515. DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
  516. //mutex_.AssertHeld();
  517. // Construct array of instruction ids for the new state.
  518. // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
  519. // those are the only operators with any effect in
  520. // RunWorkqOnEmptyString or RunWorkqOnByte.
  521. PODArray<int> inst(q->size());
  522. int n = 0;
  523. uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions
  524. bool sawmatch = false; // whether queue contains guaranteed kInstMatch
  525. bool sawmark = false; // whether queue contains a Mark
  526. if (ExtraDebug)
  527. fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
  528. for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
  529. int id = *it;
  530. if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
  531. break;
  532. if (q->is_mark(id)) {
  533. if (n > 0 && inst[n-1] != Mark) {
  534. sawmark = true;
  535. inst[n++] = Mark;
  536. }
  537. continue;
  538. }
  539. Prog::Inst* ip = prog_->inst(id);
  540. switch (ip->opcode()) {
  541. case kInstAltMatch:
  542. // This state will continue to a match no matter what
  543. // the rest of the input is. If it is the highest priority match
  544. // being considered, return the special FullMatchState
  545. // to indicate that it's all matches from here out.
  546. if (kind_ != Prog::kManyMatch &&
  547. (kind_ != Prog::kFirstMatch ||
  548. (it == q->begin() && ip->greedy(prog_))) &&
  549. (kind_ != Prog::kLongestMatch || !sawmark) &&
  550. (flag & kFlagMatch)) {
  551. if (ExtraDebug)
  552. fprintf(stderr, " -> FullMatchState\n");
  553. return FullMatchState;
  554. }
  555. FALLTHROUGH_INTENDED;
  556. default:
  557. // Record iff id is the head of its list, which must
  558. // be the case if id-1 is the last of *its* list. :)
  559. if (prog_->inst(id-1)->last())
  560. inst[n++] = *it;
  561. if (ip->opcode() == kInstEmptyWidth)
  562. needflags |= ip->empty();
  563. if (ip->opcode() == kInstMatch && !prog_->anchor_end())
  564. sawmatch = true;
  565. break;
  566. }
  567. }
  568. DCHECK_LE(n, q->size());
  569. if (n > 0 && inst[n-1] == Mark)
  570. n--;
  571. // If there are no empty-width instructions waiting to execute,
  572. // then the extra flag bits will not be used, so there is no
  573. // point in saving them. (Discarding them reduces the number
  574. // of distinct states.)
  575. if (needflags == 0)
  576. flag &= kFlagMatch;
  577. // NOTE(rsc): The code above cannot do flag &= needflags,
  578. // because if the right flags were present to pass the current
  579. // kInstEmptyWidth instructions, new kInstEmptyWidth instructions
  580. // might be reached that in turn need different flags.
  581. // The only sure thing is that if there are no kInstEmptyWidth
  582. // instructions at all, no flags will be needed.
  583. // We could do the extra work to figure out the full set of
  584. // possibly needed flags by exploring past the kInstEmptyWidth
  585. // instructions, but the check above -- are any flags needed
  586. // at all? -- handles the most common case. More fine-grained
  587. // analysis can only be justified by measurements showing that
  588. // too many redundant states are being allocated.
  589. // If there are no Insts in the list, it's a dead state,
  590. // which is useful to signal with a special pointer so that
  591. // the execution loop can stop early. This is only okay
  592. // if the state is *not* a matching state.
  593. if (n == 0 && flag == 0) {
  594. if (ExtraDebug)
  595. fprintf(stderr, " -> DeadState\n");
  596. return DeadState;
  597. }
  598. // If we're in longest match mode, the state is a sequence of
  599. // unordered state sets separated by Marks. Sort each set
  600. // to canonicalize, to reduce the number of distinct sets stored.
  601. if (kind_ == Prog::kLongestMatch) {
  602. int* ip = inst.data();
  603. int* ep = ip + n;
  604. while (ip < ep) {
  605. int* markp = ip;
  606. while (markp < ep && *markp != Mark)
  607. markp++;
  608. std::sort(ip, markp);
  609. if (markp < ep)
  610. markp++;
  611. ip = markp;
  612. }
  613. }
  614. // If we're in many match mode, canonicalize for similar reasons:
  615. // we have an unordered set of states (i.e. we don't have Marks)
  616. // and sorting will reduce the number of distinct sets stored.
  617. if (kind_ == Prog::kManyMatch) {
  618. int* ip = inst.data();
  619. int* ep = ip + n;
  620. std::sort(ip, ep);
  621. }
  622. // Append MatchSep and the match IDs in mq if necessary.
  623. if (mq != NULL) {
  624. inst[n++] = MatchSep;
  625. for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) {
  626. int id = *i;
  627. Prog::Inst* ip = prog_->inst(id);
  628. if (ip->opcode() == kInstMatch)
  629. inst[n++] = ip->match_id();
  630. }
  631. }
  632. // Save the needed empty-width flags in the top bits for use later.
  633. flag |= needflags << kFlagNeedShift;
  634. State* state = CachedState(inst.data(), n, flag);
  635. return state;
  636. }
  637. // Looks in the State cache for a State matching inst, ninst, flag.
  638. // If one is found, returns it. If one is not found, allocates one,
  639. // inserts it in the cache, and returns it.
  640. DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) {
  641. //mutex_.AssertHeld();
  642. // Look in the cache for a pre-existing state.
  643. // We have to initialise the struct like this because otherwise
  644. // MSVC will complain about the flexible array member. :(
  645. State state;
  646. state.inst_ = inst;
  647. state.ninst_ = ninst;
  648. state.flag_ = flag;
  649. StateSet::iterator it = state_cache_.find(&state);
  650. if (it != state_cache_.end()) {
  651. if (ExtraDebug)
  652. fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str());
  653. return *it;
  654. }
  655. // Must have enough memory for new state.
  656. // In addition to what we're going to allocate,
  657. // the state cache hash table seems to incur about 40 bytes per
  658. // State*, empirically.
  659. const int kStateCacheOverhead = 40;
  660. int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
  661. int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
  662. ninst*sizeof(int);
  663. if (mem_budget_ < mem + kStateCacheOverhead) {
  664. mem_budget_ = -1;
  665. return NULL;
  666. }
  667. mem_budget_ -= mem + kStateCacheOverhead;
  668. // Allocate new state along with room for next_ and inst_.
  669. char* space = std::allocator<char>().allocate(mem);
  670. State* s = new (space) State;
  671. (void) new (s->next_) std::atomic<State*>[nnext];
  672. // Work around a unfortunate bug in older versions of libstdc++.
  673. // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658)
  674. for (int i = 0; i < nnext; i++)
  675. (void) new (s->next_ + i) std::atomic<State*>(NULL);
  676. s->inst_ = new (s->next_ + nnext) int[ninst];
  677. memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);
  678. s->ninst_ = ninst;
  679. s->flag_ = flag;
  680. if (ExtraDebug)
  681. fprintf(stderr, " -> %s\n", DumpState(s).c_str());
  682. // Put state in cache and return it.
  683. state_cache_.insert(s);
  684. return s;
  685. }
  686. // Clear the cache. Must hold cache_mutex_.w or be in destructor.
  687. void DFA::ClearCache() {
  688. StateSet::iterator begin = state_cache_.begin();
  689. StateSet::iterator end = state_cache_.end();
  690. while (begin != end) {
  691. StateSet::iterator tmp = begin;
  692. ++begin;
  693. // Deallocate the blob of memory that we allocated in DFA::CachedState().
  694. // We recompute mem in order to benefit from sized delete where possible.
  695. int ninst = (*tmp)->ninst_;
  696. int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
  697. int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
  698. ninst*sizeof(int);
  699. std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem);
  700. }
  701. state_cache_.clear();
  702. }
  703. // Copies insts in state s to the work queue q.
  704. void DFA::StateToWorkq(State* s, Workq* q) {
  705. q->clear();
  706. for (int i = 0; i < s->ninst_; i++) {
  707. if (s->inst_[i] == Mark) {
  708. q->mark();
  709. } else if (s->inst_[i] == MatchSep) {
  710. // Nothing after this is an instruction!
  711. break;
  712. } else {
  713. // Explore from the head of the list.
  714. AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask);
  715. }
  716. }
  717. }
  718. // Adds ip to the work queue, following empty arrows according to flag.
  719. void DFA::AddToQueue(Workq* q, int id, uint32_t flag) {
  720. // Use stack_ to hold our stack of instructions yet to process.
  721. // It was preallocated as follows:
  722. // one entry per Capture;
  723. // one entry per EmptyWidth; and
  724. // one entry per Nop.
  725. // This reflects the maximum number of stack pushes that each can
  726. // perform. (Each instruction can be processed at most once.)
  727. // When using marks, we also added nmark == prog_->size().
  728. // (Otherwise, nmark == 0.)
  729. int* stk = stack_.data();
  730. int nstk = 0;
  731. stk[nstk++] = id;
  732. while (nstk > 0) {
  733. DCHECK_LE(nstk, stack_.size());
  734. id = stk[--nstk];
  735. Loop:
  736. if (id == Mark) {
  737. q->mark();
  738. continue;
  739. }
  740. if (id == 0)
  741. continue;
  742. // If ip is already on the queue, nothing to do.
  743. // Otherwise add it. We don't actually keep all the
  744. // ones that get added, but adding all of them here
  745. // increases the likelihood of q->contains(id),
  746. // reducing the amount of duplicated work.
  747. if (q->contains(id))
  748. continue;
  749. q->insert_new(id);
  750. // Process instruction.
  751. Prog::Inst* ip = prog_->inst(id);
  752. switch (ip->opcode()) {
  753. default:
  754. LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
  755. break;
  756. case kInstByteRange: // just save these on the queue
  757. case kInstMatch:
  758. if (ip->last())
  759. break;
  760. id = id+1;
  761. goto Loop;
  762. case kInstCapture: // DFA treats captures as no-ops.
  763. case kInstNop:
  764. if (!ip->last())
  765. stk[nstk++] = id+1;
  766. // If this instruction is the [00-FF]* loop at the beginning of
  767. // a leftmost-longest unanchored search, separate with a Mark so
  768. // that future threads (which will start farther to the right in
  769. // the input string) are lower priority than current threads.
  770. if (ip->opcode() == kInstNop && q->maxmark() > 0 &&
  771. id == prog_->start_unanchored() && id != prog_->start())
  772. stk[nstk++] = Mark;
  773. id = ip->out();
  774. goto Loop;
  775. case kInstAltMatch:
  776. DCHECK(!ip->last());
  777. id = id+1;
  778. goto Loop;
  779. case kInstEmptyWidth:
  780. if (!ip->last())
  781. stk[nstk++] = id+1;
  782. // Continue on if we have all the right flag bits.
  783. if (ip->empty() & ~flag)
  784. break;
  785. id = ip->out();
  786. goto Loop;
  787. }
  788. }
  789. }
  790. // Running of work queues. In the work queue, order matters:
  791. // the queue is sorted in priority order. If instruction i comes before j,
  792. // then the instructions that i produces during the run must come before
  793. // the ones that j produces. In order to keep this invariant, all the
  794. // work queue runners have to take an old queue to process and then
  795. // also a new queue to fill in. It's not acceptable to add to the end of
  796. // an existing queue, because new instructions will not end up in the
  797. // correct position.
  798. // Runs the work queue, processing the empty strings indicated by flag.
  799. // For example, flag == kEmptyBeginLine|kEmptyEndLine means to match
  800. // both ^ and $. It is important that callers pass all flags at once:
  801. // processing both ^ and $ is not the same as first processing only ^
  802. // and then processing only $. Doing the two-step sequence won't match
  803. // ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior
  804. // exhibited by existing implementations).
  805. void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) {
  806. newq->clear();
  807. for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
  808. if (oldq->is_mark(*i))
  809. AddToQueue(newq, Mark, flag);
  810. else
  811. AddToQueue(newq, *i, flag);
  812. }
  813. }
  814. // Runs the work queue, processing the single byte c followed by any empty
  815. // strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine,
  816. // means to match c$. Sets the bool *ismatch to true if the end of the
  817. // regular expression program has been reached (the regexp has matched).
  818. void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
  819. int c, uint32_t flag, bool* ismatch) {
  820. //mutex_.AssertHeld();
  821. newq->clear();
  822. for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
  823. if (oldq->is_mark(*i)) {
  824. if (*ismatch)
  825. return;
  826. newq->mark();
  827. continue;
  828. }
  829. int id = *i;
  830. Prog::Inst* ip = prog_->inst(id);
  831. switch (ip->opcode()) {
  832. default:
  833. LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
  834. break;
  835. case kInstFail: // never succeeds
  836. case kInstCapture: // already followed
  837. case kInstNop: // already followed
  838. case kInstAltMatch: // already followed
  839. case kInstEmptyWidth: // already followed
  840. break;
  841. case kInstByteRange: // can follow if c is in range
  842. if (!ip->Matches(c))
  843. break;
  844. AddToQueue(newq, ip->out(), flag);
  845. if (ip->hint() != 0) {
  846. // We have a hint, but we must cancel out the
  847. // increment that will occur after the break.
  848. i += ip->hint() - 1;
  849. } else {
  850. // We have no hint, so we must find the end
  851. // of the current list and then skip to it.
  852. Prog::Inst* ip0 = ip;
  853. while (!ip->last())
  854. ++ip;
  855. i += ip - ip0;
  856. }
  857. break;
  858. case kInstMatch:
  859. if (prog_->anchor_end() && c != kByteEndText &&
  860. kind_ != Prog::kManyMatch)
  861. break;
  862. *ismatch = true;
  863. if (kind_ == Prog::kFirstMatch) {
  864. // Can stop processing work queue since we found a match.
  865. return;
  866. }
  867. break;
  868. }
  869. }
  870. if (ExtraDebug)
  871. fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n",
  872. DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch);
  873. }
  874. // Processes input byte c in state, returning new state.
  875. // Caller does not hold mutex.
  876. DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
  877. // Keep only one RunStateOnByte going
  878. // even if the DFA is being run by multiple threads.
  879. MutexLock l(&mutex_);
  880. return RunStateOnByte(state, c);
  881. }
  882. // Processes input byte c in state, returning new state.
  883. DFA::State* DFA::RunStateOnByte(State* state, int c) {
  884. //mutex_.AssertHeld();
  885. if (state <= SpecialStateMax) {
  886. if (state == FullMatchState) {
  887. // It is convenient for routines like PossibleMatchRange
  888. // if we implement RunStateOnByte for FullMatchState:
  889. // once you get into this state you never get out,
  890. // so it's pretty easy.
  891. return FullMatchState;
  892. }
  893. if (state == DeadState) {
  894. LOG(DFATAL) << "DeadState in RunStateOnByte";
  895. return NULL;
  896. }
  897. if (state == NULL) {
  898. LOG(DFATAL) << "NULL state in RunStateOnByte";
  899. return NULL;
  900. }
  901. LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
  902. return NULL;
  903. }
  904. // If someone else already computed this, return it.
  905. State* ns = state->next_[ByteMap(c)].load(std::memory_order_relaxed);
  906. if (ns != NULL)
  907. return ns;
  908. // Convert state into Workq.
  909. StateToWorkq(state, q0_);
  910. // Flags marking the kinds of empty-width things (^ $ etc)
  911. // around this byte. Before the byte we have the flags recorded
  912. // in the State structure itself. After the byte we have
  913. // nothing yet (but that will change: read on).
  914. uint32_t needflag = state->flag_ >> kFlagNeedShift;
  915. uint32_t beforeflag = state->flag_ & kFlagEmptyMask;
  916. uint32_t oldbeforeflag = beforeflag;
  917. uint32_t afterflag = 0;
  918. if (c == '\n') {
  919. // Insert implicit $ and ^ around \n
  920. beforeflag |= kEmptyEndLine;
  921. afterflag |= kEmptyBeginLine;
  922. }
  923. if (c == kByteEndText) {
  924. // Insert implicit $ and \z before the fake "end text" byte.
  925. beforeflag |= kEmptyEndLine | kEmptyEndText;
  926. }
  927. // The state flag kFlagLastWord says whether the last
  928. // byte processed was a word character. Use that info to
  929. // insert empty-width (non-)word boundaries.
  930. bool islastword = (state->flag_ & kFlagLastWord) != 0;
  931. bool isword = c != kByteEndText && Prog::IsWordChar(static_cast<uint8_t>(c));
  932. if (isword == islastword)
  933. beforeflag |= kEmptyNonWordBoundary;
  934. else
  935. beforeflag |= kEmptyWordBoundary;
  936. // Okay, finally ready to run.
  937. // Only useful to rerun on empty string if there are new, useful flags.
  938. if (beforeflag & ~oldbeforeflag & needflag) {
  939. RunWorkqOnEmptyString(q0_, q1_, beforeflag);
  940. using std::swap;
  941. swap(q0_, q1_);
  942. }
  943. bool ismatch = false;
  944. RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch);
  945. using std::swap;
  946. swap(q0_, q1_);
  947. // Save afterflag along with ismatch and isword in new state.
  948. uint32_t flag = afterflag;
  949. if (ismatch)
  950. flag |= kFlagMatch;
  951. if (isword)
  952. flag |= kFlagLastWord;
  953. if (ismatch && kind_ == Prog::kManyMatch)
  954. ns = WorkqToCachedState(q0_, q1_, flag);
  955. else
  956. ns = WorkqToCachedState(q0_, NULL, flag);
  957. // Flush ns before linking to it.
  958. // Write barrier before updating state->next_ so that the
  959. // main search loop can proceed without any locking, for speed.
  960. // (Otherwise it would need one mutex operation per input byte.)
  961. state->next_[ByteMap(c)].store(ns, std::memory_order_release);
  962. return ns;
  963. }
  964. //////////////////////////////////////////////////////////////////////
  965. // DFA cache reset.
  966. // Reader-writer lock helper.
  967. //
  968. // The DFA uses a reader-writer mutex to protect the state graph itself.
  969. // Traversing the state graph requires holding the mutex for reading,
  970. // and discarding the state graph and starting over requires holding the
  971. // lock for writing. If a search needs to expand the graph but is out
  972. // of memory, it will need to drop its read lock and then acquire the
  973. // write lock. Since it cannot then atomically downgrade from write lock
  974. // to read lock, it runs the rest of the search holding the write lock.
  975. // (This probably helps avoid repeated contention, but really the decision
  976. // is forced by the Mutex interface.) It's a bit complicated to keep
  977. // track of whether the lock is held for reading or writing and thread
  978. // that through the search, so instead we encapsulate it in the RWLocker
  979. // and pass that around.
  980. class DFA::RWLocker {
  981. public:
  982. explicit RWLocker(CacheMutex* mu);
  983. ~RWLocker();
  984. // If the lock is only held for reading right now,
  985. // drop the read lock and re-acquire for writing.
  986. // Subsequent calls to LockForWriting are no-ops.
  987. // Notice that the lock is *released* temporarily.
  988. void LockForWriting();
  989. private:
  990. CacheMutex* mu_;
  991. bool writing_;
  992. RWLocker(const RWLocker&) = delete;
  993. RWLocker& operator=(const RWLocker&) = delete;
  994. };
  995. DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) {
  996. mu_->ReaderLock();
  997. }
  998. // This function is marked as NO_THREAD_SAFETY_ANALYSIS because
  999. // the annotations don't support lock upgrade.
  1000. void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
  1001. if (!writing_) {
  1002. mu_->ReaderUnlock();
  1003. mu_->WriterLock();
  1004. writing_ = true;
  1005. }
  1006. }
  1007. DFA::RWLocker::~RWLocker() {
  1008. if (!writing_)
  1009. mu_->ReaderUnlock();
  1010. else
  1011. mu_->WriterUnlock();
  1012. }
  1013. // When the DFA's State cache fills, we discard all the states in the
  1014. // cache and start over. Many threads can be using and adding to the
  1015. // cache at the same time, so we synchronize using the cache_mutex_
  1016. // to keep from stepping on other threads. Specifically, all the
  1017. // threads using the current cache hold cache_mutex_ for reading.
  1018. // When a thread decides to flush the cache, it drops cache_mutex_
  1019. // and then re-acquires it for writing. That ensures there are no
  1020. // other threads accessing the cache anymore. The rest of the search
  1021. // runs holding cache_mutex_ for writing, avoiding any contention
  1022. // with or cache pollution caused by other threads.
  1023. void DFA::ResetCache(RWLocker* cache_lock) {
  1024. // Re-acquire the cache_mutex_ for writing (exclusive use).
  1025. cache_lock->LockForWriting();
  1026. hooks::GetDFAStateCacheResetHook()({
  1027. state_budget_,
  1028. state_cache_.size(),
  1029. });
  1030. // Clear the cache, reset the memory budget.
  1031. for (int i = 0; i < kMaxStart; i++)
  1032. start_[i].start.store(NULL, std::memory_order_relaxed);
  1033. ClearCache();
  1034. mem_budget_ = state_budget_;
  1035. }
  1036. // Typically, a couple States do need to be preserved across a cache
  1037. // reset, like the State at the current point in the search.
  1038. // The StateSaver class helps keep States across cache resets.
  1039. // It makes a copy of the state's guts outside the cache (before the reset)
  1040. // and then can be asked, after the reset, to recreate the State
  1041. // in the new cache. For example, in a DFA method ("this" is a DFA):
  1042. //
  1043. // StateSaver saver(this, s);
  1044. // ResetCache(cache_lock);
  1045. // s = saver.Restore();
  1046. //
  1047. // The saver should always have room in the cache to re-create the state,
  1048. // because resetting the cache locks out all other threads, and the cache
  1049. // is known to have room for at least a couple states (otherwise the DFA
  1050. // constructor fails).
  1051. class DFA::StateSaver {
  1052. public:
  1053. explicit StateSaver(DFA* dfa, State* state);
  1054. ~StateSaver();
  1055. // Recreates and returns a state equivalent to the
  1056. // original state passed to the constructor.
  1057. // Returns NULL if the cache has filled, but
  1058. // since the DFA guarantees to have room in the cache
  1059. // for a couple states, should never return NULL
  1060. // if used right after ResetCache.
  1061. State* Restore();
  1062. private:
  1063. DFA* dfa_; // the DFA to use
  1064. int* inst_; // saved info from State
  1065. int ninst_;
  1066. uint32_t flag_;
  1067. bool is_special_; // whether original state was special
  1068. State* special_; // if is_special_, the original state
  1069. StateSaver(const StateSaver&) = delete;
  1070. StateSaver& operator=(const StateSaver&) = delete;
  1071. };
  1072. DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
  1073. dfa_ = dfa;
  1074. if (state <= SpecialStateMax) {
  1075. inst_ = NULL;
  1076. ninst_ = 0;
  1077. flag_ = 0;
  1078. is_special_ = true;
  1079. special_ = state;
  1080. return;
  1081. }
  1082. is_special_ = false;
  1083. special_ = NULL;
  1084. flag_ = state->flag_;
  1085. ninst_ = state->ninst_;
  1086. inst_ = new int[ninst_];
  1087. memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
  1088. }
  1089. DFA::StateSaver::~StateSaver() {
  1090. if (!is_special_)
  1091. delete[] inst_;
  1092. }
  1093. DFA::State* DFA::StateSaver::Restore() {
  1094. if (is_special_)
  1095. return special_;
  1096. MutexLock l(&dfa_->mutex_);
  1097. State* s = dfa_->CachedState(inst_, ninst_, flag_);
  1098. if (s == NULL)
  1099. LOG(DFATAL) << "StateSaver failed to restore state.";
  1100. return s;
  1101. }
  1102. //////////////////////////////////////////////////////////////////////
  1103. //
  1104. // DFA execution.
  1105. //
  1106. // The basic search loop is easy: start in a state s and then for each
  1107. // byte c in the input, s = s->next[c].
  1108. //
  1109. // This simple description omits a few efficiency-driven complications.
  1110. //
  1111. // First, the State graph is constructed incrementally: it is possible
  1112. // that s->next[c] is null, indicating that that state has not been
  1113. // fully explored. In this case, RunStateOnByte must be invoked to
  1114. // determine the next state, which is cached in s->next[c] to save
  1115. // future effort. An alternative reason for s->next[c] to be null is
  1116. // that the DFA has reached a so-called "dead state", in which any match
  1117. // is no longer possible. In this case RunStateOnByte will return NULL
  1118. // and the processing of the string can stop early.
  1119. //
  1120. // Second, a 256-element pointer array for s->next_ makes each State
  1121. // quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[]
  1122. // maps from bytes to "byte classes" and then next_ only needs to have
  1123. // as many pointers as there are byte classes. A byte class is simply a
  1124. // range of bytes that the regexp never distinguishes between.
  1125. // A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
  1126. // 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit
  1127. // but in exchange we typically cut the size of a State (and thus our
  1128. // memory footprint) by about 5-10x. The comments still refer to
  1129. // s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
  1130. //
  1131. // Third, it is common for a DFA for an unanchored match to begin in a
  1132. // state in which only one particular byte value can take the DFA to a
  1133. // different state. That is, s->next[c] != s for only one c. In this
  1134. // situation, the DFA can do better than executing the simple loop.
  1135. // Instead, it can call memchr to search very quickly for the byte c.
  1136. // Whether the start state has this property is determined during a
  1137. // pre-compilation pass and the "can_prefix_accel" argument is set.
  1138. //
  1139. // Fourth, the desired behavior is to search for the leftmost-best match
  1140. // (approximately, the same one that Perl would find), which is not
  1141. // necessarily the match ending earliest in the string. Each time a
  1142. // match is found, it must be noted, but the DFA must continue on in
  1143. // hope of finding a higher-priority match. In some cases, the caller only
  1144. // cares whether there is any match at all, not which one is found.
  1145. // The "want_earliest_match" flag causes the search to stop at the first
  1146. // match found.
  1147. //
  1148. // Fifth, one algorithm that uses the DFA needs it to run over the
  1149. // input string backward, beginning at the end and ending at the beginning.
  1150. // Passing false for the "run_forward" flag causes the DFA to run backward.
  1151. //
  1152. // The checks for these last three cases, which in a naive implementation
  1153. // would be performed once per input byte, slow the general loop enough
  1154. // to merit specialized versions of the search loop for each of the
  1155. // eight possible settings of the three booleans. Rather than write
  1156. // eight different functions, we write one general implementation and then
  1157. // inline it to create the specialized ones.
  1158. //
  1159. // Note that matches are delayed by one byte, to make it easier to
  1160. // accomodate match conditions depending on the next input byte (like $ and \b).
  1161. // When s->next[c]->IsMatch(), it means that there is a match ending just
  1162. // *before* byte c.
  1163. // The generic search loop. Searches text for a match, returning
  1164. // the pointer to the end of the chosen match, or NULL if no match.
  1165. // The bools are equal to the same-named variables in params, but
  1166. // making them function arguments lets the inliner specialize
  1167. // this function to each combination (see two paragraphs above).
  1168. template <bool can_prefix_accel,
  1169. bool want_earliest_match,
  1170. bool run_forward>
  1171. inline bool DFA::InlinedSearchLoop(SearchParams* params) {
  1172. State* start = params->start;
  1173. const uint8_t* bp = BytePtr(params->text.data()); // start of text
  1174. const uint8_t* p = bp; // text scanning point
  1175. const uint8_t* ep = BytePtr(params->text.data() +
  1176. params->text.size()); // end of text
  1177. const uint8_t* resetp = NULL; // p at last cache reset
  1178. if (!run_forward) {
  1179. using std::swap;
  1180. swap(p, ep);
  1181. }
  1182. const uint8_t* bytemap = prog_->bytemap();
  1183. const uint8_t* lastmatch = NULL; // most recent matching position in text
  1184. bool matched = false;
  1185. State* s = start;
  1186. if (ExtraDebug)
  1187. fprintf(stderr, "@stx: %s\n", DumpState(s).c_str());
  1188. if (s->IsMatch()) {
  1189. matched = true;
  1190. lastmatch = p;
  1191. if (ExtraDebug)
  1192. fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str());
  1193. if (params->matches != NULL && kind_ == Prog::kManyMatch) {
  1194. for (int i = s->ninst_ - 1; i >= 0; i--) {
  1195. int id = s->inst_[i];
  1196. if (id == MatchSep)
  1197. break;
  1198. params->matches->insert(id);
  1199. }
  1200. }
  1201. if (want_earliest_match) {
  1202. params->ep = reinterpret_cast<const char*>(lastmatch);
  1203. return true;
  1204. }
  1205. }
  1206. while (p != ep) {
  1207. if (ExtraDebug)
  1208. fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str());
  1209. if (can_prefix_accel && s == start) {
  1210. // In start state, only way out is to find the prefix,
  1211. // so we use prefix accel (e.g. memchr) to skip ahead.
  1212. // If not found, we can skip to the end of the string.
  1213. p = BytePtr(prog_->PrefixAccel(p, ep - p));
  1214. if (p == NULL) {
  1215. p = ep;
  1216. break;
  1217. }
  1218. }
  1219. int c;
  1220. if (run_forward)
  1221. c = *p++;
  1222. else
  1223. c = *--p;
  1224. // Note that multiple threads might be consulting
  1225. // s->next_[bytemap[c]] simultaneously.
  1226. // RunStateOnByte takes care of the appropriate locking,
  1227. // including a memory barrier so that the unlocked access
  1228. // (sometimes known as "double-checked locking") is safe.
  1229. // The alternative would be either one DFA per thread
  1230. // or one mutex operation per input byte.
  1231. //
  1232. // ns == DeadState means the state is known to be dead
  1233. // (no more matches are possible).
  1234. // ns == NULL means the state has not yet been computed
  1235. // (need to call RunStateOnByteUnlocked).
  1236. // RunStateOnByte returns ns == NULL if it is out of memory.
  1237. // ns == FullMatchState means the rest of the string matches.
  1238. //
  1239. // Okay to use bytemap[] not ByteMap() here, because
  1240. // c is known to be an actual byte and not kByteEndText.
  1241. State* ns = s->next_[bytemap[c]].load(std::memory_order_acquire);
  1242. if (ns == NULL) {
  1243. ns = RunStateOnByteUnlocked(s, c);
  1244. if (ns == NULL) {
  1245. // After we reset the cache, we hold cache_mutex exclusively,
  1246. // so if resetp != NULL, it means we filled the DFA state
  1247. // cache with this search alone (without any other threads).
  1248. // Benchmarks show that doing a state computation on every
  1249. // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
  1250. // same at about 2 MB/s. Unless we're processing an average
  1251. // of 10 bytes per state computation, fail so that RE2 can
  1252. // fall back to the NFA. However, RE2::Set cannot fall back,
  1253. // so we just have to keep on keeping on in that case.
  1254. if (dfa_should_bail_when_slow && resetp != NULL &&
  1255. static_cast<size_t>(p - resetp) < 10*state_cache_.size() &&
  1256. kind_ != Prog::kManyMatch) {
  1257. params->failed = true;
  1258. return false;
  1259. }
  1260. resetp = p;
  1261. // Prepare to save start and s across the reset.
  1262. StateSaver save_start(this, start);
  1263. StateSaver save_s(this, s);
  1264. // Discard all the States in the cache.
  1265. ResetCache(params->cache_lock);
  1266. // Restore start and s so we can continue.
  1267. if ((start = save_start.Restore()) == NULL ||
  1268. (s = save_s.Restore()) == NULL) {
  1269. // Restore already did LOG(DFATAL).
  1270. params->failed = true;
  1271. return false;
  1272. }
  1273. ns = RunStateOnByteUnlocked(s, c);
  1274. if (ns == NULL) {
  1275. LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
  1276. params->failed = true;
  1277. return false;
  1278. }
  1279. }
  1280. }
  1281. if (ns <= SpecialStateMax) {
  1282. if (ns == DeadState) {
  1283. params->ep = reinterpret_cast<const char*>(lastmatch);
  1284. return matched;
  1285. }
  1286. // FullMatchState
  1287. params->ep = reinterpret_cast<const char*>(ep);
  1288. return true;
  1289. }
  1290. s = ns;
  1291. if (s->IsMatch()) {
  1292. matched = true;
  1293. // The DFA notices the match one byte late,
  1294. // so adjust p before using it in the match.
  1295. if (run_forward)
  1296. lastmatch = p - 1;
  1297. else
  1298. lastmatch = p + 1;
  1299. if (ExtraDebug)
  1300. fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str());
  1301. if (params->matches != NULL && kind_ == Prog::kManyMatch) {
  1302. for (int i = s->ninst_ - 1; i >= 0; i--) {
  1303. int id = s->inst_[i];
  1304. if (id == MatchSep)
  1305. break;
  1306. params->matches->insert(id);
  1307. }
  1308. }
  1309. if (want_earliest_match) {
  1310. params->ep = reinterpret_cast<const char*>(lastmatch);
  1311. return true;
  1312. }
  1313. }
  1314. }
  1315. // Process one more byte to see if it triggers a match.
  1316. // (Remember, matches are delayed one byte.)
  1317. if (ExtraDebug)
  1318. fprintf(stderr, "@etx: %s\n", DumpState(s).c_str());
  1319. int lastbyte;
  1320. if (run_forward) {
  1321. if (params->text.end() == params->context.end())
  1322. lastbyte = kByteEndText;
  1323. else
  1324. lastbyte = params->text.end()[0] & 0xFF;
  1325. } else {
  1326. if (params->text.begin() == params->context.begin())
  1327. lastbyte = kByteEndText;
  1328. else
  1329. lastbyte = params->text.begin()[-1] & 0xFF;
  1330. }
  1331. State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire);
  1332. if (ns == NULL) {
  1333. ns = RunStateOnByteUnlocked(s, lastbyte);
  1334. if (ns == NULL) {
  1335. StateSaver save_s(this, s);
  1336. ResetCache(params->cache_lock);
  1337. if ((s = save_s.Restore()) == NULL) {
  1338. params->failed = true;
  1339. return false;
  1340. }
  1341. ns = RunStateOnByteUnlocked(s, lastbyte);
  1342. if (ns == NULL) {
  1343. LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
  1344. params->failed = true;
  1345. return false;
  1346. }
  1347. }
  1348. }
  1349. if (ns <= SpecialStateMax) {
  1350. if (ns == DeadState) {
  1351. params->ep = reinterpret_cast<const char*>(lastmatch);
  1352. return matched;
  1353. }
  1354. // FullMatchState
  1355. params->ep = reinterpret_cast<const char*>(ep);
  1356. return true;
  1357. }
  1358. s = ns;
  1359. if (s->IsMatch()) {
  1360. matched = true;
  1361. lastmatch = p;
  1362. if (ExtraDebug)
  1363. fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str());
  1364. if (params->matches != NULL && kind_ == Prog::kManyMatch) {
  1365. for (int i = s->ninst_ - 1; i >= 0; i--) {
  1366. int id = s->inst_[i];
  1367. if (id == MatchSep)
  1368. break;
  1369. params->matches->insert(id);
  1370. }
  1371. }
  1372. }
  1373. params->ep = reinterpret_cast<const char*>(lastmatch);
  1374. return matched;
  1375. }
  1376. // Inline specializations of the general loop.
  1377. bool DFA::SearchFFF(SearchParams* params) {
  1378. return InlinedSearchLoop<false, false, false>(params);
  1379. }
  1380. bool DFA::SearchFFT(SearchParams* params) {
  1381. return InlinedSearchLoop<false, false, true>(params);
  1382. }
  1383. bool DFA::SearchFTF(SearchParams* params) {
  1384. return InlinedSearchLoop<false, true, false>(params);
  1385. }
  1386. bool DFA::SearchFTT(SearchParams* params) {
  1387. return InlinedSearchLoop<false, true, true>(params);
  1388. }
  1389. bool DFA::SearchTFF(SearchParams* params) {
  1390. return InlinedSearchLoop<true, false, false>(params);
  1391. }
  1392. bool DFA::SearchTFT(SearchParams* params) {
  1393. return InlinedSearchLoop<true, false, true>(params);
  1394. }
  1395. bool DFA::SearchTTF(SearchParams* params) {
  1396. return InlinedSearchLoop<true, true, false>(params);
  1397. }
  1398. bool DFA::SearchTTT(SearchParams* params) {
  1399. return InlinedSearchLoop<true, true, true>(params);
  1400. }
  1401. // For performance, calls the appropriate specialized version
  1402. // of InlinedSearchLoop.
  1403. bool DFA::FastSearchLoop(SearchParams* params) {
  1404. // Because the methods are private, the Searches array
  1405. // cannot be declared at top level.
  1406. static bool (DFA::*Searches[])(SearchParams*) = {
  1407. &DFA::SearchFFF,
  1408. &DFA::SearchFFT,
  1409. &DFA::SearchFTF,
  1410. &DFA::SearchFTT,
  1411. &DFA::SearchTFF,
  1412. &DFA::SearchTFT,
  1413. &DFA::SearchTTF,
  1414. &DFA::SearchTTT,
  1415. };
  1416. int index = 4 * params->can_prefix_accel +
  1417. 2 * params->want_earliest_match +
  1418. 1 * params->run_forward;
  1419. return (this->*Searches[index])(params);
  1420. }
  1421. // The discussion of DFA execution above ignored the question of how
  1422. // to determine the initial state for the search loop. There are two
  1423. // factors that influence the choice of start state.
  1424. //
  1425. // The first factor is whether the search is anchored or not.
  1426. // The regexp program (Prog*) itself has
  1427. // two different entry points: one for anchored searches and one for
  1428. // unanchored searches. (The unanchored version starts with a leading ".*?"
  1429. // and then jumps to the anchored one.)
  1430. //
  1431. // The second factor is where text appears in the larger context, which
  1432. // determines which empty-string operators can be matched at the beginning
  1433. // of execution. If text is at the very beginning of context, \A and ^ match.
  1434. // Otherwise if text is at the beginning of a line, then ^ matches.
  1435. // Otherwise it matters whether the character before text is a word character
  1436. // or a non-word character.
  1437. //
  1438. // The two cases (unanchored vs not) and four cases (empty-string flags)
  1439. // combine to make the eight cases recorded in the DFA's begin_text_[2],
  1440. // begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached
  1441. // StartInfos. The start state for each is filled in the first time it
  1442. // is used for an actual search.
  1443. // Examines text, context, and anchored to determine the right start
  1444. // state for the DFA search loop. Fills in params and returns true on success.
  1445. // Returns false on failure.
  1446. bool DFA::AnalyzeSearch(SearchParams* params) {
  1447. const StringPiece& text = params->text;
  1448. const StringPiece& context = params->context;
  1449. // Sanity check: make sure that text lies within context.
  1450. if (text.begin() < context.begin() || text.end() > context.end()) {
  1451. LOG(DFATAL) << "context does not contain text";
  1452. params->start = DeadState;
  1453. return true;
  1454. }
  1455. // Determine correct search type.
  1456. int start;
  1457. uint32_t flags;
  1458. if (params->run_forward) {
  1459. if (text.begin() == context.begin()) {
  1460. start = kStartBeginText;
  1461. flags = kEmptyBeginText|kEmptyBeginLine;
  1462. } else if (text.begin()[-1] == '\n') {
  1463. start = kStartBeginLine;
  1464. flags = kEmptyBeginLine;
  1465. } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) {
  1466. start = kStartAfterWordChar;
  1467. flags = kFlagLastWord;
  1468. } else {
  1469. start = kStartAfterNonWordChar;
  1470. flags = 0;
  1471. }
  1472. } else {
  1473. if (text.end() == context.end()) {
  1474. start = kStartBeginText;
  1475. flags = kEmptyBeginText|kEmptyBeginLine;
  1476. } else if (text.end()[0] == '\n') {
  1477. start = kStartBeginLine;
  1478. flags = kEmptyBeginLine;
  1479. } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) {
  1480. start = kStartAfterWordChar;
  1481. flags = kFlagLastWord;
  1482. } else {
  1483. start = kStartAfterNonWordChar;
  1484. flags = 0;
  1485. }
  1486. }
  1487. if (params->anchored)
  1488. start |= kStartAnchored;
  1489. StartInfo* info = &start_[start];
  1490. // Try once without cache_lock for writing.
  1491. // Try again after resetting the cache
  1492. // (ResetCache will relock cache_lock for writing).
  1493. if (!AnalyzeSearchHelper(params, info, flags)) {
  1494. ResetCache(params->cache_lock);
  1495. if (!AnalyzeSearchHelper(params, info, flags)) {
  1496. LOG(DFATAL) << "Failed to analyze start state.";
  1497. params->failed = true;
  1498. return false;
  1499. }
  1500. }
  1501. params->start = info->start.load(std::memory_order_acquire);
  1502. // Even if we could prefix accel, we cannot do so when anchored and,
  1503. // less obviously, we cannot do so when we are going to need flags.
  1504. // This trick works only when there is a single byte that leads to a
  1505. // different state!
  1506. if (prog_->can_prefix_accel() &&
  1507. !params->anchored &&
  1508. params->start > SpecialStateMax &&
  1509. params->start->flag_ >> kFlagNeedShift == 0)
  1510. params->can_prefix_accel = true;
  1511. if (ExtraDebug)
  1512. fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
  1513. params->anchored, params->run_forward, flags,
  1514. DumpState(params->start).c_str(), params->can_prefix_accel);
  1515. return true;
  1516. }
  1517. // Fills in info if needed. Returns true on success, false on failure.
  1518. bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
  1519. uint32_t flags) {
  1520. // Quick check.
  1521. State* start = info->start.load(std::memory_order_acquire);
  1522. if (start != NULL)
  1523. return true;
  1524. MutexLock l(&mutex_);
  1525. start = info->start.load(std::memory_order_relaxed);
  1526. if (start != NULL)
  1527. return true;
  1528. q0_->clear();
  1529. AddToQueue(q0_,
  1530. params->anchored ? prog_->start() : prog_->start_unanchored(),
  1531. flags);
  1532. start = WorkqToCachedState(q0_, NULL, flags);
  1533. if (start == NULL)
  1534. return false;
  1535. // Synchronize with "quick check" above.
  1536. info->start.store(start, std::memory_order_release);
  1537. return true;
  1538. }
  1539. // The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
  1540. bool DFA::Search(const StringPiece& text,
  1541. const StringPiece& context,
  1542. bool anchored,
  1543. bool want_earliest_match,
  1544. bool run_forward,
  1545. bool* failed,
  1546. const char** epp,
  1547. SparseSet* matches) {
  1548. *epp = NULL;
  1549. if (!ok()) {
  1550. *failed = true;
  1551. return false;
  1552. }
  1553. *failed = false;
  1554. if (ExtraDebug) {
  1555. fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());
  1556. fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
  1557. std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_);
  1558. }
  1559. RWLocker l(&cache_mutex_);
  1560. SearchParams params(text, context, &l);
  1561. params.anchored = anchored;
  1562. params.want_earliest_match = want_earliest_match;
  1563. params.run_forward = run_forward;
  1564. params.matches = matches;
  1565. if (!AnalyzeSearch(&params)) {
  1566. *failed = true;
  1567. return false;
  1568. }
  1569. if (params.start == DeadState)
  1570. return false;
  1571. if (params.start == FullMatchState) {
  1572. if (run_forward == want_earliest_match)
  1573. *epp = text.data();
  1574. else
  1575. *epp = text.data() + text.size();
  1576. return true;
  1577. }
  1578. if (ExtraDebug)
  1579. fprintf(stderr, "start %s\n", DumpState(params.start).c_str());
  1580. bool ret = FastSearchLoop(&params);
  1581. if (params.failed) {
  1582. *failed = true;
  1583. return false;
  1584. }
  1585. *epp = params.ep;
  1586. return ret;
  1587. }
  1588. DFA* Prog::GetDFA(MatchKind kind) {
  1589. // For a forward DFA, half the memory goes to each DFA.
  1590. // However, if it is a "many match" DFA, then there is
  1591. // no counterpart with which the memory must be shared.
  1592. //
  1593. // For a reverse DFA, all the memory goes to the
  1594. // "longest match" DFA, because RE2 never does reverse
  1595. // "first match" searches.
  1596. if (kind == kFirstMatch) {
  1597. std::call_once(dfa_first_once_, [](Prog* prog) {
  1598. prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2);
  1599. }, this);
  1600. return dfa_first_;
  1601. } else if (kind == kManyMatch) {
  1602. std::call_once(dfa_first_once_, [](Prog* prog) {
  1603. prog->dfa_first_ = new DFA(prog, kManyMatch, prog->dfa_mem_);
  1604. }, this);
  1605. return dfa_first_;
  1606. } else {
  1607. std::call_once(dfa_longest_once_, [](Prog* prog) {
  1608. if (!prog->reversed_)
  1609. prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_ / 2);
  1610. else
  1611. prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_);
  1612. }, this);
  1613. return dfa_longest_;
  1614. }
  1615. }
  1616. void Prog::DeleteDFA(DFA* dfa) {
  1617. delete dfa;
  1618. }
  1619. // Executes the regexp program to search in text,
  1620. // which itself is inside the larger context. (As a convenience,
  1621. // passing a NULL context is equivalent to passing text.)
  1622. // Returns true if a match is found, false if not.
  1623. // If a match is found, fills in match0->end() to point at the end of the match
  1624. // and sets match0->begin() to text.begin(), since the DFA can't track
  1625. // where the match actually began.
  1626. //
  1627. // This is the only external interface (class DFA only exists in this file).
  1628. //
  1629. bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
  1630. Anchor anchor, MatchKind kind, StringPiece* match0,
  1631. bool* failed, SparseSet* matches) {
  1632. *failed = false;
  1633. StringPiece context = const_context;
  1634. if (context.data() == NULL)
  1635. context = text;
  1636. bool caret = anchor_start();
  1637. bool dollar = anchor_end();
  1638. if (reversed_) {
  1639. using std::swap;
  1640. swap(caret, dollar);
  1641. }
  1642. if (caret && context.begin() != text.begin())
  1643. return false;
  1644. if (dollar && context.end() != text.end())
  1645. return false;
  1646. // Handle full match by running an anchored longest match
  1647. // and then checking if it covers all of text.
  1648. bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
  1649. bool endmatch = false;
  1650. if (kind == kManyMatch) {
  1651. // This is split out in order to avoid clobbering kind.
  1652. } else if (kind == kFullMatch || anchor_end()) {
  1653. endmatch = true;
  1654. kind = kLongestMatch;
  1655. }
  1656. // If the caller doesn't care where the match is (just whether one exists),
  1657. // then we can stop at the very first match we find, the so-called
  1658. // "earliest match".
  1659. bool want_earliest_match = false;
  1660. if (kind == kManyMatch) {
  1661. // This is split out in order to avoid clobbering kind.
  1662. if (matches == NULL) {
  1663. want_earliest_match = true;
  1664. }
  1665. } else if (match0 == NULL && !endmatch) {
  1666. want_earliest_match = true;
  1667. kind = kLongestMatch;
  1668. }
  1669. DFA* dfa = GetDFA(kind);
  1670. const char* ep;
  1671. bool matched = dfa->Search(text, context, anchored,
  1672. want_earliest_match, !reversed_,
  1673. failed, &ep, matches);
  1674. if (*failed) {
  1675. hooks::GetDFASearchFailureHook()({
  1676. // Nothing yet...
  1677. });
  1678. return false;
  1679. }
  1680. if (!matched)
  1681. return false;
  1682. if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size()))
  1683. return false;
  1684. // If caller cares, record the boundary of the match.
  1685. // We only know where it ends, so use the boundary of text
  1686. // as the beginning.
  1687. if (match0) {
  1688. if (reversed_)
  1689. *match0 =
  1690. StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep));
  1691. else
  1692. *match0 =
  1693. StringPiece(text.data(), static_cast<size_t>(ep - text.data()));
  1694. }
  1695. return true;
  1696. }
  1697. // Build out all states in DFA. Returns number of states.
  1698. int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
  1699. if (!ok())
  1700. return 0;
  1701. // Pick out start state for unanchored search
  1702. // at beginning of text.
  1703. RWLocker l(&cache_mutex_);
  1704. SearchParams params(StringPiece(), StringPiece(), &l);
  1705. params.anchored = false;
  1706. if (!AnalyzeSearch(&params) ||
  1707. params.start == NULL ||
  1708. params.start == DeadState)
  1709. return 0;
  1710. // Add start state to work queue.
  1711. // Note that any State* that we handle here must point into the cache,
  1712. // so we can simply depend on pointer-as-a-number hashing and equality.
  1713. std::unordered_map<State*, int> m;
  1714. std::deque<State*> q;
  1715. m.emplace(params.start, static_cast<int>(m.size()));
  1716. q.push_back(params.start);
  1717. // Compute the input bytes needed to cover all of the next pointers.
  1718. int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
  1719. std::vector<int> input(nnext);
  1720. for (int c = 0; c < 256; c++) {
  1721. int b = prog_->bytemap()[c];
  1722. while (c < 256-1 && prog_->bytemap()[c+1] == b)
  1723. c++;
  1724. input[b] = c;
  1725. }
  1726. input[prog_->bytemap_range()] = kByteEndText;
  1727. // Scratch space for the output.
  1728. std::vector<int> output(nnext);
  1729. // Flood to expand every state.
  1730. bool oom = false;
  1731. while (!q.empty()) {
  1732. State* s = q.front();
  1733. q.pop_front();
  1734. for (int c : input) {
  1735. State* ns = RunStateOnByteUnlocked(s, c);
  1736. if (ns == NULL) {
  1737. oom = true;
  1738. break;
  1739. }
  1740. if (ns == DeadState) {
  1741. output[ByteMap(c)] = -1;
  1742. continue;
  1743. }
  1744. if (m.find(ns) == m.end()) {
  1745. m.emplace(ns, static_cast<int>(m.size()));
  1746. q.push_back(ns);
  1747. }
  1748. output[ByteMap(c)] = m[ns];
  1749. }
  1750. if (cb)
  1751. cb(oom ? NULL : output.data(),
  1752. s == FullMatchState || s->IsMatch());
  1753. if (oom)
  1754. break;
  1755. }
  1756. return static_cast<int>(m.size());
  1757. }
  1758. // Build out all states in DFA for kind. Returns number of states.
  1759. int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) {
  1760. return GetDFA(kind)->BuildAllStates(cb);
  1761. }
  1762. // Computes min and max for matching string.
  1763. // Won't return strings bigger than maxlen.
  1764. bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
  1765. if (!ok())
  1766. return false;
  1767. // NOTE: if future users of PossibleMatchRange want more precision when
  1768. // presented with infinitely repeated elements, consider making this a
  1769. // parameter to PossibleMatchRange.
  1770. static int kMaxEltRepetitions = 0;
  1771. // Keep track of the number of times we've visited states previously. We only
  1772. // revisit a given state if it's part of a repeated group, so if the value
  1773. // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set
  1774. // |*max| to |PrefixSuccessor(*max)|.
  1775. //
  1776. // Also note that previously_visited_states[UnseenStatePtr] will, in the STL
  1777. // tradition, implicitly insert a '0' value at first use. We take advantage
  1778. // of that property below.
  1779. std::unordered_map<State*, int> previously_visited_states;
  1780. // Pick out start state for anchored search at beginning of text.
  1781. RWLocker l(&cache_mutex_);
  1782. SearchParams params(StringPiece(), StringPiece(), &l);
  1783. params.anchored = true;
  1784. if (!AnalyzeSearch(&params))
  1785. return false;
  1786. if (params.start == DeadState) { // No matching strings
  1787. *min = "";
  1788. *max = "";
  1789. return true;
  1790. }
  1791. if (params.start == FullMatchState) // Every string matches: no max
  1792. return false;
  1793. // The DFA is essentially a big graph rooted at params.start,
  1794. // and paths in the graph correspond to accepted strings.
  1795. // Each node in the graph has potentially 256+1 arrows
  1796. // coming out, one for each byte plus the magic end of
  1797. // text character kByteEndText.
  1798. // To find the smallest possible prefix of an accepted
  1799. // string, we just walk the graph preferring to follow
  1800. // arrows with the lowest bytes possible. To find the
  1801. // largest possible prefix, we follow the largest bytes
  1802. // possible.
  1803. // The test for whether there is an arrow from s on byte j is
  1804. // ns = RunStateOnByteUnlocked(s, j);
  1805. // if (ns == NULL)
  1806. // return false;
  1807. // if (ns != DeadState && ns->ninst > 0)
  1808. // The RunStateOnByteUnlocked call asks the DFA to build out the graph.
  1809. // It returns NULL only if the DFA has run out of memory,
  1810. // in which case we can't be sure of anything.
  1811. // The second check sees whether there was graph built
  1812. // and whether it is interesting graph. Nodes might have
  1813. // ns->ninst == 0 if they exist only to represent the fact
  1814. // that a match was found on the previous byte.
  1815. // Build minimum prefix.
  1816. State* s = params.start;
  1817. min->clear();
  1818. MutexLock lock(&mutex_);
  1819. for (int i = 0; i < maxlen; i++) {
  1820. if (previously_visited_states[s] > kMaxEltRepetitions)
  1821. break;
  1822. previously_visited_states[s]++;
  1823. // Stop if min is a match.
  1824. State* ns = RunStateOnByte(s, kByteEndText);
  1825. if (ns == NULL) // DFA out of memory
  1826. return false;
  1827. if (ns != DeadState && (ns == FullMatchState || ns->IsMatch()))
  1828. break;
  1829. // Try to extend the string with low bytes.
  1830. bool extended = false;
  1831. for (int j = 0; j < 256; j++) {
  1832. ns = RunStateOnByte(s, j);
  1833. if (ns == NULL) // DFA out of memory
  1834. return false;
  1835. if (ns == FullMatchState ||
  1836. (ns > SpecialStateMax && ns->ninst_ > 0)) {
  1837. extended = true;
  1838. min->append(1, static_cast<char>(j));
  1839. s = ns;
  1840. break;
  1841. }
  1842. }
  1843. if (!extended)
  1844. break;
  1845. }
  1846. // Build maximum prefix.
  1847. previously_visited_states.clear();
  1848. s = params.start;
  1849. max->clear();
  1850. for (int i = 0; i < maxlen; i++) {
  1851. if (previously_visited_states[s] > kMaxEltRepetitions)
  1852. break;
  1853. previously_visited_states[s] += 1;
  1854. // Try to extend the string with high bytes.
  1855. bool extended = false;
  1856. for (int j = 255; j >= 0; j--) {
  1857. State* ns = RunStateOnByte(s, j);
  1858. if (ns == NULL)
  1859. return false;
  1860. if (ns == FullMatchState ||
  1861. (ns > SpecialStateMax && ns->ninst_ > 0)) {
  1862. extended = true;
  1863. max->append(1, static_cast<char>(j));
  1864. s = ns;
  1865. break;
  1866. }
  1867. }
  1868. if (!extended) {
  1869. // Done, no need for PrefixSuccessor.
  1870. return true;
  1871. }
  1872. }
  1873. // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
  1874. PrefixSuccessor(max);
  1875. // If there are no bytes left, we have no way to say "there is no maximum
  1876. // string". We could make the interface more complicated and be able to
  1877. // return "there is no maximum but here is a minimum", but that seems like
  1878. // overkill -- the most common no-max case is all possible strings, so not
  1879. // telling the caller that the empty string is the minimum match isn't a
  1880. // great loss.
  1881. if (max->empty())
  1882. return false;
  1883. return true;
  1884. }
  1885. // PossibleMatchRange for a Prog.
  1886. bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
  1887. // Have to use dfa_longest_ to get all strings for full matches.
  1888. // For example, (a|aa) never matches aa in first-match mode.
  1889. return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen);
  1890. }
  1891. } // namespace re2