HtmlAnalyzer.php 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. <?php
  2. /**
  3. * @copyright Copyright (c) 2014 Carsten Brandt
  4. * @license https://github.com/cebe/js-search/blob/master/LICENSE
  5. * @link https://github.com/cebe/js-search#readme
  6. */
  7. namespace cebe\jssearch\analyzer;
  8. use cebe\jssearch\AnalyzerInterface;
  9. use cebe\jssearch\TokenizerInterface;
  /**
   * Analyzer for HTML files.
   *
   * Pulls the text out of headings, the document title, paragraphs and
   * table/list cells with regular expressions, hands it to a tokenizer and
   * scales the weight of every returned token by a factor that depends on the
   * kind of element the text was found in.
   *
   * NOTE: the patterns are case-sensitive and only match tags written without
   * attributes (e.g. `<p>` is matched, `<p class="x">` and `<P>` are not).
   *
   * @author Carsten Brandt <mail@cebe.cc>
   */
  class HtmlAnalyzer implements AnalyzerInterface
  {
      /**
       * @var int|float base value for headings: a token inside `<hN>` has its
       * weight multiplied by `(headWeight - N) / 10`, i.e. with the default of
       * 20 by 1.9 for `<h1>` down to 1.4 for `<h6>`.
       */
      public $headWeight = 20;
      /**
       * @var int|float factor applied to the weight of tokens found in `<title>`.
       */
      public $titleWeight = 4;
      /**
       * @var int|float factor applied to the weight of tokens found in `<p>`,
       * `<th>`, `<td>`, `<li>`, `<dd>` and `<dt>`.
       */
      public $textWeight = 1.2;

      /**
       * Builds the token index for one HTML document.
       *
       * The result is a list with one entry per matched element, in the order
       * headings, title, paragraphs, table/list cells. Each entry is the list
       * of `['t' => ..., 'w' => ...]` items produced for that element.
       *
       * @inheritDoc
       */
      public function analyze($string, TokenizerInterface $tokenizer)
      {
          // Receives the tokenizer weight and the digit captured from `<hN>`.
          $headingWeight = function ($w, $level) {
              return $w * ($this->headWeight - $level) / 10;
          };

          // TODO improve weight formula here, e.g. relate each weight to the
          // total number of tokens in the document (1 + weight / tokenCount).
          return array_merge(
              $this->findText($string, '~<h(\d)>(.*?)</h\1>~s', ['text' => 2, 'weight' => 1], $tokenizer, $headingWeight),
              $this->findText($string, '~<title>(.*?)</title>~s', ['text' => 1], $tokenizer, $this->titleWeight),
              $this->findText($string, '~<p>(.*?|(?R))</p>~s', ['text' => 1], $tokenizer, $this->textWeight),
              $this->findText($string, '~<(th|td|li|dd|dt)>(.*?)</\1>~s', ['text' => 2], $tokenizer, $this->textWeight)
          );
      }

      /**
       * Collects the weighted tokens of every match of a pattern.
       *
       * @param string $string the HTML to search in.
       * @param string $pattern PCRE pattern that is applied with preg_match_all().
       * @param array $selectors capture group numbers: `text` is the group that
       * holds the markup to strip and tokenize; `weight` is only read when
       * $weight is a closure and names the group whose value is passed to it.
       * @param TokenizerInterface $tokenizer its tokenize() result is expected
       * to be an array of items that have a `t` and a `w` key.
       * @param int|float|\Closure $weight either a factor the token weight is
       * multiplied with, or a closure `function ($tokenWeight, $captured)` that
       * returns the new weight.
       * @return array one entry per match, each being the list of
       * `['t' => ..., 'w' => ...]` items for that match.
       */
      private function findText($string, $pattern, $selectors, $tokenizer, $weight)
      {
          preg_match_all($pattern, $string, $matches);
          if (empty($matches[0])) {
              // Nothing matched, or the pattern could not be applied at all.
              return [];
          }

          $texts = $matches[$selectors['text']];
          // Decide once how the weight is computed instead of once per token.
          $usesCallback = $weight instanceof \Closure;

          $index = [];
          foreach (array_keys($matches[0]) as $i) {
              $captured = $usesCallback ? $matches[$selectors['weight']][$i] : null;
              $index[] = array_map(
                  function ($token) use ($weight, $usesCallback, $captured) {
                      $w = $usesCallback
                          ? $weight($token['w'], $captured)
                          : $token['w'] * $weight;

                      return ['t' => $token['t'], 'w' => $w];
                  },
                  $tokenizer->tokenize(strip_tags($texts[$i]))
              );
          }

          return $index;
      }
  }