<?php
/**
 * @copyright Copyright (c) 2014 Carsten Brandt
 * @license https://github.com/cebe/js-search/blob/master/LICENSE
 * @link https://github.com/cebe/js-search#readme
 */

namespace cebe\jssearch\analyzer;

use cebe\jssearch\AnalyzerInterface;
use cebe\jssearch\TokenizerInterface;

/**
 * Analyzer for HTML files.
 *
 * Extracts the text of headings, the title, paragraphs and table/list cells from an
 * HTML string using regular expressions, tokenizes every matched fragment and scales
 * the weight of each token by a factor that depends on the element it was found in.
 *
 * The patterns only match tags written without attributes (e.g. `<p>`, not `<p class="x">`).
 *
 * @author Carsten Brandt <mail@cebe.cc>
 */
class HtmlAnalyzer implements AnalyzerInterface
{
	/**
	 * @var int|float base for heading weights: a token found in `<hN>` is scaled by
	 * `($headWeight - N) / 10`, so lower heading levels weigh more.
	 */
	public $headWeight = 20;
	/**
	 * @var int|float factor applied to the weight of tokens found in `<title>`.
	 */
	public $titleWeight = 4;
	/**
	 * @var int|float factor applied to the weight of tokens found in `<p>`, `<th>`, `<td>`,
	 * `<li>`, `<dd>` and `<dt>`.
	 */
	public $textWeight = 1.2;

	/**
	 * @inheritDoc
	 *
	 * @return array list with one entry per matched HTML fragment; each entry is the list of
	 * tokens of that fragment in the form `['t' => ..., 'w' => ...]`.
	 */
	public function analyze($string, TokenizerInterface $tokenizer)
	{
		// Headings: the captured level digit lowers the weight, <h1> being the heaviest.
		$headingWeight = function($w, $h) { return $w * ($this->headWeight - $h) / 10; };

		// TODO improve weight formula here, e.g. by relating each token weight to the
		// total number of words found in the document.
		return array_merge(
			$this->findText($string, '~<h(\d)>(.*?)</h\1>~s',   ['text' => 2, 'weight' => 1], $tokenizer, $headingWeight),
			$this->findText($string, '~<title>(.*?)</title>~s', ['text' => 1], $tokenizer, $this->titleWeight),
			$this->findText($string, '~<p>(.*?|(?R))</p>~s',    ['text' => 1], $tokenizer, $this->textWeight),
			$this->findText($string, '~<(th|td|li|dd|dt)>(.*?)</\1>~s', ['text' => 2], $tokenizer, $this->textWeight)
		);
	}

	/**
	 * Finds all fragments matching a pattern and returns their weighted tokens.
	 *
	 * @param string $string the HTML to search in.
	 * @param string $pattern the regular expression passed to preg_match_all().
	 * @param array $selectors capture group numbers: `text` is the group holding the text to
	 * tokenize; `weight` is the group whose value is passed to `$weight` when that is a closure.
	 * @param TokenizerInterface $tokenizer the tokenizer applied to the tag-stripped text.
	 * @param int|float|\Closure $weight either a factor the token weight is multiplied with, or a
	 * closure `function($tokenWeight, $capturedValue)` returning the new weight.
	 * @return array list with one entry per match, each being the list of tokens
	 * (`['t' => ..., 'w' => ...]`) of that match. Empty when nothing matched or matching failed.
	 */
	private function findText($string, $pattern, $selectors, $tokenizer, $weight)
	{
		$matches = [];
		// preg_match_all() yields 0 for "no match" and false on a PCRE error (for example an
		// exhausted backtrack/recursion limit); in both cases this pattern contributes nothing.
		if (!preg_match_all($pattern, $string, $matches)) {
			return [];
		}

		$index = [];
		foreach($matches[0] as $i => $match) {
			if ($weight instanceof \Closure) {
				// value of the capture group that drives the weight, e.g. the heading level
				$captured = $matches[$selectors['weight']][$i];
				$scale = function($w) use ($weight, $captured) { return $weight($w, $captured); };
			} else {
				$scale = function($w) use ($weight) { return $w * $weight; };
			}

			$index[] = array_map(
				function($token) use ($scale) {
					return ['t' => $token['t'], 'w' => $scale($token['w'])];
				},
				$tokenizer->tokenize(strip_tags($matches[$selectors['text']][$i]))
			);
		}
		return $index;
	}
}