StandardTokenizer.php 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. <?php
  2. /**
  3. * @copyright Copyright (c) 2014 Carsten Brandt
  4. * @license https://github.com/cebe/js-search/blob/master/LICENSE
  5. * @link https://github.com/cebe/js-search#readme
  6. */
  7. namespace cebe\jssearch\tokenizer;
  8. use cebe\jssearch\TokenizerInterface;
  /**
   * StandardTokenizer splits text into lower-cased word tokens.
   *
   * Text is split on whitespace and on the characters listed in [[delimiters]],
   * every token is lower-cased, and tokens found in [[stopWords]] are dropped.
   * [[tokenizeJs()]] emits a JavaScript function that applies the same rules,
   * so that index building (PHP) and searching (browser) agree on the tokens.
   *
   * @author Carsten Brandt <mail@cebe.cc>
   */
  class StandardTokenizer implements TokenizerInterface
  {
      /**
       * @var array a list of stopwords to remove from the token list.
       * Tokens are lower-cased before they are compared against this list,
       * so entries need to be lower case in order to match.
       */
      public $stopWords = [
          // default lucene http://stackoverflow.com/questions/17527741/what-is-the-default-list-of-stopwords-used-in-lucenes-stopfilter
          "a", "an", "and", "are", "as", "at", "be", "but", "by",
          "for", "if", "in", "into", "is", "it",
          "no", "not", "of", "on", "or", "such",
          "that", "the", "their", "then", "there", "these",
          "they", "this", "to", "was", "will", "with"
      ];
      /**
       * @var string a list of characters that should be used as word delimiters.
       * Whitespace always acts as a delimiter in addition to these characters.
       */
      public $delimiters = '.,;:\\/[](){}';

      /**
       * Tokenizes a string and returns an array of the following format:
       *
       * ```
       * [['t' => 'word', 'w' => 2], ['t' => 'other', 'w' => 1]]
       * ```
       *
       * where the first part is the token string and the second is a weight value.
       * This tokenizer assigns a weight of `1` to every token.
       *
       * Also removes [[stopWords]] from the list. The result is always a
       * sequentially indexed list, even when stop words were removed.
       *
       * @param string $string the string to tokenize
       * @return array list of tokens; empty if the string contains no tokens or could not be split.
       */
      public function tokenize($string)
      {
          $delimiters = preg_quote($this->delimiters, '/');

          // The "u" modifier makes PCRE treat pattern and subject as UTF-8, so a
          // multi-byte character is never cut in half and multi-byte delimiters work.
          // It also lets \s match Unicode whitespace, which is closer to what the
          // JavaScript regex generated by tokenizeJs() matches.
          $words = preg_split("/[\\s$delimiters]+/u", $string, -1, PREG_SPLIT_NO_EMPTY);
          if ($words === false) {
              // With "u" preg_split() fails on input that is not valid UTF-8;
              // fall back to the byte-wise split instead of losing the text.
              $words = preg_split("/[\\s$delimiters]+/", $string, -1, PREG_SPLIT_NO_EMPTY);
          }
          if ($words === false) {
              return [];
          }

          // Building the list with [] keeps the keys sequential (0..n-1), so the
          // result still encodes as a JSON array after stop words were skipped.
          $tokens = [];
          foreach ($words as $word) {
              $word = mb_strtolower($word, 'UTF-8');
              // strict comparison: numeric-looking tokens such as "1e3" and "1000"
              // must not be treated as equal.
              if (in_array($word, $this->stopWords, true)) {
                  continue;
              }
              $tokens[] = ['t' => $word, 'w' => 1];
          }

          return $tokens;
      }

      /**
       * Returns a javascript equivalent of [[tokenize]] that will be used
       * on client side to tokenize the search query.
       *
       * This is used to ensure the same tokenizer is used for building the index and for searching.
       *
       * The returned code is an anonymous function expression taking the query string
       * and returning an array of `{t: token, w: 1}` objects. The current values of
       * [[delimiters]] and [[stopWords]] are embedded into the generated code.
       *
       * @return string
       */
      public function tokenizeJs()
      {
          $delimiters = preg_quote($this->delimiters, '/');
          // array_values() guarantees a JSON array (not an object) even when the
          // configured stop word list has non-sequential keys.
          $stopWords = json_encode(array_values($this->stopWords));

          // Unlike preg_split() with PREG_SPLIT_NO_EMPTY, String.prototype.split()
          // yields empty strings for leading/trailing delimiters, so they are filtered out.
          return <<<JS
function(string) {
    var stopWords = $stopWords;
    return string.split(/[\s$delimiters]+/).map(function(val) {
        return val.toLowerCase();
    }).filter(function(val) {
        return val !== '' && stopWords.indexOf(val) === -1;
    }).map(function(word) {
        return {t: word, w: 1};
    });
}
JS;
      }
  }