| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 | 
							- <?php
 
- /**
 
-  * @copyright Copyright (c) 2014 Carsten Brandt
 
-  * @license https://github.com/cebe/js-search/blob/master/LICENSE
 
-  * @link https://github.com/cebe/js-search#readme
 
-  */
 
- namespace cebe\jssearch\tokenizer;
 
- use cebe\jssearch\TokenizerInterface;
 
- /**
 
-  * StandardTokenizer
 
-  *
 
-  * @author Carsten Brandt <mail@cebe.cc>
 
-  */
 
class StandardTokenizer implements TokenizerInterface
{
	/**
	 * @var array a list of stopwords to remove from the token list.
	 * Tokens are lower-cased before they are compared against this list,
	 * so entries should be lower-case strings.
	 */
	public $stopWords = [
		// default lucene http://stackoverflow.com/questions/17527741/what-is-the-default-list-of-stopwords-used-in-lucenes-stopfilter
		"a", "an", "and", "are", "as", "at", "be", "but", "by",
		"for", "if", "in", "into", "is", "it",
		"no", "not", "of", "on", "or", "such",
		"that", "the", "their", "then", "there", "these",
		"they", "this", "to", "was", "will", "with"
	];
	/**
	 * @var string a list of characters that should be used as word delimiters.
	 * Whitespace always acts as a delimiter in addition to these characters.
	 */
	public $delimiters = '.,;:\\/[](){}';

	/**
	 * Tokenizes a string and returns an array of the following format:
	 *
	 * ```
	 * [['t' => 'word', 'w' => 2], ['t' => 'other', 'w' => 1]]
	 * ```
	 *
	 * where the first part is the token string and the second is a weight value.
	 * This implementation assigns a weight of `1` to every token.
	 *
	 * The string is split on whitespace and [[delimiters]], every part is
	 * lower-cased (UTF-8 aware) and parts listed in [[stopWords]] are removed.
	 * The returned array is always a zero-based list without gaps, so it
	 * serializes to a JSON array.
	 *
	 * @param string $string the string to tokenize
	 * @return array list of tokens in order of appearance
	 */
	public function tokenize($string)
	{
		$pattern = '/[\\s' . preg_quote($this->delimiters, '/') . ']+/';
		$words = preg_split($pattern, $string, -1, PREG_SPLIT_NO_EMPTY);

		$tokens = [];
		foreach ($words as $word) {
			$word = mb_strtolower($word, 'UTF-8');
			// strict comparison: avoids numeric-string juggling such as "1e3" == "1000"
			if (in_array($word, $this->stopWords, true)) {
				continue;
			}
			// appending keeps the keys sequential even when stop words were dropped
			$tokens[] = ['t' => $word, 'w' => 1];
		}

		return $tokens;
	}

	/**
	 * Returns a javascript equivalent of [[tokenize]] that will be used
	 * on client side to tokenize the search query.
	 *
	 * This is used to ensure the same tokenizer is used for building the index and for searching.
	 *
	 * The generated function splits on the same character class, lower-cases
	 * each part, drops empty parts (`String.split` keeps them, unlike
	 * `PREG_SPLIT_NO_EMPTY`) and removes the configured [[stopWords]].
	 *
	 * @return string source code of an anonymous javascript function that takes
	 * a string and returns an array of `{t: token, w: 1}` objects.
	 */
	public function tokenizeJs()
	{
		$delimiters = preg_quote($this->delimiters, '/');
		// array_values() guarantees a JSON array even if the configured list has non-sequential keys
		$stopWords = json_encode(array_values($this->stopWords));

		// NOTE: the closing heredoc marker must stay at column 0 for PHP < 7.3
		return <<<JS
function(string) {
		var stopWords = $stopWords;
		return string.split(/[\s{$delimiters}]+/).map(function(val) {
			return val.toLowerCase();
		}).filter(function(val) {
			return val !== '' && stopWords.indexOf(val) === -1;
		}).map(function(word) {
			return {t: word, w: 1};
		});
}
JS;
	}
}
 
 
  |