12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- <?php
- /**
- * @copyright Copyright (c) 2014 Carsten Brandt
- * @license https://github.com/cebe/js-search/blob/master/LICENSE
- * @link https://github.com/cebe/js-search#readme
- */
- namespace cebe\jssearch\analyzer;
- use cebe\jssearch\AnalyzerInterface;
- use cebe\jssearch\TokenizerInterface;
- /**
- * Analyzer for HTML files
- *
- * @author Carsten Brandt <mail@cebe.cc>
- */
class HtmlAnalyzer implements AnalyzerInterface
{
    /**
     * @var int|float base value for heading tokens. A token found in `<hN>` has its
     * weight multiplied by `(headWeight - N) / 10`, so lower heading levels weigh more.
     */
    public $headWeight = 20;
    /**
     * @var int|float multiplier applied to the weight of tokens found in `<title>`.
     */
    public $titleWeight = 4;
    /**
     * @var int|float multiplier applied to the weight of tokens found in
     * `<p>`, `<th>`, `<td>`, `<li>`, `<dd>` and `<dt>` elements.
     */
    public $textWeight = 1.2;

    /**
     * Extracts weighted tokens from the headings, title, paragraphs, table cells
     * and list items of an HTML string.
     *
     * Note that the patterns only match opening tags without attributes
     * (e.g. `<p>`, but not `<p class="intro">`).
     *
     * @param string $string the HTML source to analyze.
     * @param TokenizerInterface $tokenizer tokenizer used to split the extracted text.
     * @return array list with one entry per matched element; each entry is the list of
     * tokens of that element in the form `['t' => ..., 'w' => ...]`.
     * @inheritDoc
     */
    public function analyze($string, TokenizerInterface $tokenizer)
    {
        // Heading weight depends on the heading level captured by the pattern.
        $headingWeight = function ($tokenWeight, $level) {
            return $tokenWeight * ($this->headWeight - $level) / 10;
        };

        // TODO improve weight formula, e.g. by relating each token weight to the
        // total number of indexed words (1 + weight / wordCount).
        return array_merge(
            $this->findText($string, '~<h(\d)>(.*?)</h\1>~s', ['text' => 2, 'weight' => 1], $tokenizer, $headingWeight),
            $this->findText($string, '~<title>(.*?)</title>~s', ['text' => 1], $tokenizer, $this->titleWeight),
            $this->findText($string, '~<p>(.*?|(?R))</p>~s', ['text' => 1], $tokenizer, $this->textWeight),
            $this->findText($string, '~<(th|td|li|dd|dt)>(.*?)</\1>~s', ['text' => 2], $tokenizer, $this->textWeight)
        );
    }

    /**
     * Finds all matches of a pattern and tokenizes the text of each match.
     *
     * @param string $string the HTML source to search in.
     * @param string $pattern PCRE pattern, including delimiters and modifiers.
     * @param array $selectors capture group numbers: `text` is the group holding the
     * text to tokenize; `weight` is the group passed as second argument to `$weight`
     * and is only read when `$weight` is a closure.
     * @param TokenizerInterface $tokenizer tokenizer used to split the matched text.
     * @param int|float|\Closure $weight either a factor the token weight is multiplied
     * with, or a closure `function ($tokenWeight, $weightGroup)` returning the new weight.
     * @return array list with one entry per match, each being the list of tokens of that
     * match in the form `['t' => ..., 'w' => ...]`. Empty when nothing matched or the
     * pattern could not be evaluated.
     */
    private function findText($string, $pattern, $selectors, $tokenizer, $weight)
    {
        // preg_match_all() returns 0 for "no match" and false on failure;
        // in both cases there is nothing to index.
        if (!preg_match_all($pattern, $string, $matches)) {
            return [];
        }

        $textGroup = $selectors['text'];
        $dynamicWeight = $weight instanceof \Closure;

        $index = [];
        foreach (array_keys($matches[0]) as $i) {
            // The weight group is the same for every token of this match.
            $weightArgument = $dynamicWeight ? $matches[$selectors['weight']][$i] : null;

            $index[] = array_map(
                static function ($token) use ($weight, $dynamicWeight, $weightArgument) {
                    $w = $dynamicWeight
                        ? $weight($token['w'], $weightArgument)
                        : $token['w'] * $weight;

                    return ['t' => $token['t'], 'w' => $w];
                },
                $tokenizer->tokenize(strip_tags($matches[$textGroup][$i]))
            );
        }

        return $index;
    }
}
|