123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127 |
- <?php
- /**
- * @copyright Copyright (c) 2014 Carsten Brandt
- * @license https://github.com/cebe/js-search/blob/master/LICENSE
- * @link https://github.com/cebe/js-search#readme
- */
- namespace cebe\jssearch;
- use cebe\jssearch\analyzer\HtmlAnalyzer;
- use cebe\jssearch\tokenizer\StandardTokenizer;
/**
 * Builds the search index that is exported as JavaScript for the jssearch client.
 *
 * Files are read from disk and handed to the analyzer, which yields weighted tokens.
 * The result is collected in two public structures:
 *
 * - `$files`: file number (starting at 1) => `['url' => ..., 'title' => ...]`
 * - `$index`: token => list of `['f' => file number, 'w' => weight]`
 *
 * @author Carsten Brandt <mail@cebe.cc>
 */
class Indexer
{
    /**
     * @var array token => list of `['f' => file number, 'w' => weight]` entries.
     */
    public $index = [];
    /**
     * @var array file number => `['url' => string, 'title' => string]`.
     * Numbering starts at 1 and continues across calls to indexFiles().
     */
    public $files = [];

    private $_tokenizer;
    private $_analyzer;

    /**
     * Reads the given files and adds them to the index.
     *
     * May be called repeatedly; file numbers continue after the ones already registered.
     * A file and its tokens are only added once the file has been read and analyzed, so
     * the index keeps its list shape even if an exception aborts the run.
     *
     * @param array|\Traversable $files paths of the files to index.
     * @param string $basePath leading part of every path that is cut off to build the URL.
     * @param string $baseUrl prefix that is put in front of the remaining path.
     * @throws \RuntimeException if a file cannot be read.
     */
    public function indexFiles($files, $basePath, $baseUrl = './')
    {
        $fileNumber = count($this->files);

        foreach ($files as $file) {
            $contents = file_get_contents($file);
            if ($contents === false) {
                // do not register an empty pseudo document for a file that could not be read
                throw new \RuntimeException("Unable to read file for indexing: $file");
            }

            $fileInfo = $this->generateFileInfo($file, $contents, $basePath, $baseUrl);

            // Collect the weight of every token of this file first.
            // $word['t'] is the token, $word['w'] its weight; the weights of a token
            // that occurs more than once in the same file are multiplied.
            $weights = [];
            foreach ($this->getAnalyzer()->analyze($contents, $this->getTokenizer()) as $words) {
                foreach ($words as $word) {
                    $token = $word['t'];
                    if (array_key_exists($token, $weights)) {
                        $weights[$token] *= $word['w'];
                    } else {
                        $weights[$token] = $word['w'];
                    }
                }
            }

            // Appending keeps each token's entries a plain list, which json_encode()
            // renders as a JSON array.
            $fileNumber++;
            $this->files[$fileNumber] = $fileInfo;
            foreach ($weights as $token => $weight) {
                $this->index[$token][] = [
                    'f' => $fileNumber,
                    'w' => $weight,
                ];
            }
        }
    }

    /**
     * Creates the entry that is stored in `$files` for one file.
     *
     * The title is the tag-stripped content of the first `<h1>` element, or of the
     * `<title>` element if there is no `<h1>`. Both are matched case-insensitively and
     * may carry attributes. If neither exists the placeholder `<i>No title</i>` is used.
     *
     * The URL is `$baseUrl` followed by the part of `$file` that remains after cutting
     * off as many characters as `$basePath` has without trailing slashes; backslashes
     * are converted to forward slashes. The remaining part keeps its leading slash.
     *
     * @param string $file path of the file.
     * @param string $contents content of the file.
     * @param string $basePath path prefix to cut off from `$file`.
     * @param string $baseUrl prefix for the resulting URL.
     * @return array array with the keys `url` and `title`.
     */
    protected function generateFileInfo($file, $contents, $basePath, $baseUrl)
    {
        $title = '<i>No title</i>';
        foreach (['h1', 'title'] as $tag) {
            $pattern = '~<' . $tag . '(?:\s[^>]*)?>(.*?)</' . $tag . '\s*>~is';
            if (preg_match($pattern, $contents, $matches)) {
                $title = strip_tags($matches[1]);
                break;
            }
        }

        $relativePath = substr($file, strlen(rtrim($basePath, '\\/')));

        return [
            'url' => $baseUrl . str_replace('\\', '/', $relativePath),
            'title' => $title,
        ];
    }

    /**
     * Renders the collected data as JavaScript assignments to the `jssearch` object.
     *
     * `$index` and `$files` are JSON encoded; the return value of the tokenizer's
     * `tokenizeJs()` is inserted verbatim as `jssearch.tokenizeString`.
     *
     * @return string the JavaScript code.
     * @throws \RuntimeException if the index or the file list cannot be encoded as JSON.
     */
    public function exportJs()
    {
        $index = $this->encodeJson($this->index, 'index');
        $files = $this->encodeJson($this->files, 'file list');
        $tokenizeString = $this->getTokenizer()->tokenizeJs();

        // the heredoc body and its closing marker stay at column 0 on purpose:
        // this keeps the output free of indentation on every PHP version
        return <<<JS
jssearch.index = $index;
jssearch.files = $files;
jssearch.tokenizeString = $tokenizeString;
JS;
    }

    /**
     * JSON encodes a value and fails loudly instead of yielding broken JavaScript.
     *
     * @param mixed $data the value to encode.
     * @param string $label name of the value, used in the error message.
     * @return string the JSON representation.
     * @throws \RuntimeException if json_encode() reports a failure.
     */
    private function encodeJson($data, $label)
    {
        $json = json_encode($data);
        if ($json === false) {
            throw new \RuntimeException(
                "Unable to encode the search $label as JSON (json_last_error() = " . json_last_error() . ').'
            );
        }

        return $json;
    }

    /**
     * Returns the tokenizer; a StandardTokenizer is created on first use if none was set.
     *
     * @return TokenizerInterface
     */
    public function getTokenizer()
    {
        if ($this->_tokenizer === null) {
            $this->_tokenizer = new StandardTokenizer();
        }

        return $this->_tokenizer;
    }

    /**
     * Sets the tokenizer that is passed to the analyzer and exported by exportJs().
     *
     * @param TokenizerInterface $tokenizer
     */
    public function setTokenizer($tokenizer)
    {
        $this->_tokenizer = $tokenizer;
    }

    /**
     * Returns the analyzer; an HtmlAnalyzer is created on first use if none was set.
     *
     * @return AnalyzerInterface
     */
    public function getAnalyzer()
    {
        if ($this->_analyzer === null) {
            $this->_analyzer = new HtmlAnalyzer();
        }

        return $this->_analyzer;
    }

    /**
     * Sets the analyzer that extracts the weighted tokens from the file contents.
     *
     * @param AnalyzerInterface $analyzer
     */
    public function setAnalyzer($analyzer)
    {
        $this->_analyzer = $analyzer;
    }
}
|