Indexer.php 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. <?php
  2. /**
  3. * @copyright Copyright (c) 2014 Carsten Brandt
  4. * @license https://github.com/cebe/js-search/blob/master/LICENSE
  5. * @link https://github.com/cebe/js-search#readme
  6. */
  7. namespace cebe\jssearch;
  8. use cebe\jssearch\analyzer\HtmlAnalyzer;
  9. use cebe\jssearch\tokenizer\StandardTokenizer;
  10. /**
  11. * @author Carsten Brandt <mail@cebe.cc>
  12. */
  13. class Indexer
  14. {
  15. public $index = [];
  16. public $files = [];
  17. public function indexFiles($files, $basePath, $baseUrl = './')
  18. {
  19. $fi = count($this->files);
  20. foreach($files as $file) {
  21. $fi++;
  22. $contents = file_get_contents($file);
  23. // create file entry
  24. $this->files[$fi] = $this->generateFileInfo($file, $contents, $basePath, $baseUrl);
  25. // analyze file
  26. foreach($this->getAnalyzer()->analyze($contents, $this->getTokenizer()) as $index) {
  27. foreach($index as $word) {
  28. // $word['t'] - the token
  29. // $word['w'] - the weight
  30. if (isset($this->index[$word['t']][$fi])) {
  31. $this->index[$word['t']][$fi]['w'] *= $word['w'];
  32. } else {
  33. $this->index[$word['t']][$fi] = [
  34. 'f' => $fi,
  35. 'w' => $word['w'],
  36. ];
  37. }
  38. }
  39. }
  40. }
  41. // reset array indexes for files to create correct json arrays
  42. foreach($this->index as $word => $files) {
  43. $this->index[$word] = array_values($files);
  44. }
  45. }
  46. protected function generateFileInfo($file, $contents, $basePath, $baseUrl)
  47. {
  48. // create file entry
  49. if (preg_match('~<h1>(.*?)</h1>~s', $contents, $matches)) {
  50. $title = strip_tags($matches[1]);
  51. } elseif (preg_match('~<title>(.*?)</title>~s', $contents, $matches)) {
  52. $title = strip_tags($matches[1]);
  53. } else {
  54. $title = '<i>No title</i>';
  55. }
  56. return [
  57. 'url' => $baseUrl . str_replace('\\', '/', substr($file, strlen(rtrim($basePath, '\\/')))),
  58. 'title' => $title,
  59. ];
  60. }
  61. public function exportJs()
  62. {
  63. $index = json_encode($this->index);
  64. $files = json_encode($this->files);
  65. $tokenizeString = $this->getTokenizer()->tokenizeJs();
  66. return <<<JS
  67. jssearch.index = $index;
  68. jssearch.files = $files;
  69. jssearch.tokenizeString = $tokenizeString;
  70. JS;
  71. }
  72. private $_tokenizer;
  73. /**
  74. * @return TokenizerInterface
  75. */
  76. public function getTokenizer()
  77. {
  78. if ($this->_tokenizer === null) {
  79. $this->_tokenizer = new StandardTokenizer();
  80. }
  81. return $this->_tokenizer;
  82. }
  83. /**
  84. * @param TokenizerInterface $tokenizer
  85. */
  86. public function setTokenizer($tokenizer)
  87. {
  88. $this->_tokenizer = $tokenizer;
  89. }
  90. private $_analyzer;
  91. /**
  92. * @return AnalyzerInterface
  93. */
  94. public function getAnalyzer()
  95. {
  96. if ($this->_analyzer === null) {
  97. $this->_analyzer = new HtmlAnalyzer();
  98. }
  99. return $this->_analyzer;
  100. }
  101. /**
  102. * @param AnalyzerInterface $analyzer
  103. */
  104. public function setAnalyzer($analyzer)
  105. {
  106. $this->_analyzer = $analyzer;
  107. }
  108. }