<?php
/**
 * @copyright Copyright (c) 2014 Carsten Brandt
 * @license https://github.com/cebe/js-search/blob/master/LICENSE
 * @link https://github.com/cebe/js-search#readme
 */

namespace cebe\jssearch\analyzer;

use cebe\jssearch\AnalyzerInterface;
use cebe\jssearch\TokenizerInterface;

/**
 * Analyzer for HTML files
 *
 * Extracts the text of headings, the title, paragraphs and table/list cells
 * from an HTML string via regular expressions, tokenizes each fragment and
 * scales every token weight depending on the kind of element it was found in.
 *
 * Only tags written without attributes (e.g. `<p>`, not `<p class="x">`) are
 * matched by the patterns used here.
 *
 * @author Carsten Brandt <mail@cebe.cc>
 */
class HtmlAnalyzer implements AnalyzerInterface
{
	/**
	 * @var int|float base value for heading weights. A token found in `<hN>` is
	 * scaled by `($headWeight - N) / 10`, so lower heading levels weigh more.
	 */
	public $headWeight = 20;
	/**
	 * @var int|float factor applied to the weight of tokens found in `<title>`.
	 */
	public $titleWeight = 4;
	/**
	 * @var int|float factor applied to the weight of tokens found in
	 * `<p>`, `<th>`, `<td>`, `<li>`, `<dd>` and `<dt>` elements.
	 */
	public $textWeight = 1.2;


	/**
	 * @inheritDoc
	 */
	public function analyze($string, TokenizerInterface $tokenizer)
	{
		// heading tokens are weighted by heading level: $h is the digit captured from <hN>
		$headingWeight = function($w, $h) {
			return $w * ($this->headWeight - $h) / 10;
		};

		// TODO improve weight formula here, e.g. normalize weights by the total word count:
		// w = 1 + w / $wordCount
		return array_merge(
			$this->findText($string, '~<h(\d)>(.*?)</h\1>~s',   ['text' => 2, 'weight' => 1], $tokenizer, $headingWeight),
			$this->findText($string, '~<title>(.*?)</title>~s', ['text' => 1], $tokenizer, $this->titleWeight),
			$this->findText($string, '~<p>(.*?|(?R))</p>~s',    ['text' => 1], $tokenizer, $this->textWeight),
			$this->findText($string, '~<(th|td|li|dd|dt)>(.*?)</\1>~s', ['text' => 2], $tokenizer, $this->textWeight)
		);
	}

	/**
	 * Finds all fragments matching a pattern and returns their weighted tokens.
	 *
	 * For every match of `$pattern` the capture group selected by
	 * `$selectors['text']` is stripped of tags and passed to the tokenizer.
	 * Each resulting token is reduced to its `t` and `w` entries, with `w`
	 * adjusted by `$weight`.
	 *
	 * @param string $string the HTML to search in.
	 * @param string $pattern PCRE pattern (including delimiters) locating the fragments.
	 * @param array $selectors capture group indexes: `text` is the group holding
	 * the text to tokenize; `weight` is the group whose matched value is passed as
	 * second argument to `$weight` and is only read when `$weight` is a closure.
	 * @param TokenizerInterface $tokenizer tokenizer whose `tokenize()` result is
	 * iterated here as a list of arrays having `t` and `w` keys.
	 * @param int|float|\Closure $weight either a factor the token weight is
	 * multiplied with, or a closure `function($tokenWeight, $matchedValue)`
	 * returning the new token weight.
	 * @return array one entry per match, each being a list of
	 * `['t' => ..., 'w' => ...]` token arrays.
	 */
	private function findText($string, $pattern, $selectors, $tokenizer, $weight)
	{
		// the kind of weight does not change per token, so decide once
		$isCallback = $weight instanceof \Closure;

		$index = [];
		preg_match_all($pattern, $string, $matches);
		foreach(array_keys($matches[0]) as $i) {
			$tokens = $tokenizer->tokenize(strip_tags($matches[$selectors['text']][$i]));
			$index[] = array_map(
				function($token) use ($weight, $isCallback, $matches, $selectors, $i) {
					if ($isCallback) {
						$w = $weight($token['w'], $matches[$selectors['weight']][$i]);
					} else {
						$w = $token['w'] * $weight;
					}
					return ['t' => $token['t'], 'w' => $w];
				},
				$tokens
			);
		}
		return $index;
	}
}