123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- <?php
- /**
- * @copyright Copyright (c) 2014 Carsten Brandt
- * @license https://github.com/cebe/js-search/blob/master/LICENSE
- * @link https://github.com/cebe/js-search#readme
- */
- namespace cebe\jssearch\tokenizer;
- use cebe\jssearch\TokenizerInterface;
- /**
- * StandardTokenizer
- *
- * @author Carsten Brandt <mail@cebe.cc>
- */
- class StandardTokenizer implements TokenizerInterface
- {
- /**
- * @var array a list of stopwords to remove from the token list.
- */
- public $stopWords = [
- // default lucene http://stackoverflow.com/questions/17527741/what-is-the-default-list-of-stopwords-used-in-lucenes-stopfilter
- "a", "an", "and", "are", "as", "at", "be", "but", "by",
- "for", "if", "in", "into", "is", "it",
- "no", "not", "of", "on", "or", "such",
- "that", "the", "their", "then", "there", "these",
- "they", "this", "to", "was", "will", "with"
- ];
- /**
- * @var string a list of characters that should be used as word delimiters.
- */
- public $delimiters = '.,;:\\/[](){}';
- /**
- * Tokenizes a string and returns an array of the following format:
- *
- * ```
- * [['t' => 'word', 'w' => 2], ['t' => 'other', 'w' => 1]]
- * ```
- *
- * where the first part is the token string and the second is a weight value.
- *
- * Also removes [[stopWords]] from the list.
- *
- * @param string $string the string to tokenize
- * @return array
- */
- public function tokenize($string)
- {
- $delimiters = preg_quote($this->delimiters, '/');
- return array_map(function($token) {return ['t' => $token, 'w' => 1]; }, array_filter(
- array_map(function($t) { return mb_strtolower($t, 'UTF-8'); }, preg_split("/[\\s$delimiters]+/", $string, -1, PREG_SPLIT_NO_EMPTY)),
- function($word) {
- return !in_array($word, $this->stopWords);
- }
- ));
- }
- /**
- * Returns a javascript equivalent of [[tokenize]] that will be used
- * on client side to tokenize the search query.
- *
- * This is used to ensure the same tokenizer is used for building the index and for searching.
- *
- * @return string
- */
- public function tokenizeJs()
- {
- $delimiters = preg_quote($this->delimiters, '/');
- $stopWords = json_encode($this->stopWords);
- return <<<JS
- function(string) {
- var stopWords = $stopWords;
- return string.split(/[\s$delimiters]+/).map(function(val) {
- return val.toLowerCase();
- }).filter(function(val) {
- for (w in stopWords) {
- if (stopWords[w] == val) return false;
- }
- return true;
- }).map(function(word) {
- return {t: word, w: 1};
- });
- }
- JS;
- }
- }
|