Highlighter.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. <?php
  2. /* Copyright (c)
  3. * - 2006-2013, Ivan Sagalaev (maniac@softwaremaniacs.org), highlight.js
  4. * (original author)
  5. * - 2013-2015, Geert Bergman (geert@scrivo.nl), highlight.php
  6. * - 2014, Daniel Lynge, highlight.php (contributor)
  7. * All rights reserved.
  8. *
  9. * Redistribution and use in source and binary forms, with or without
  10. * modification, are permitted provided that the following conditions are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright notice,
  13. * this list of conditions and the following disclaimer.
  14. * 2. Redistributions in binary form must reproduce the above copyright notice,
  15. * this list of conditions and the following disclaimer in the documentation
  16. * and/or other materials provided with the distribution.
  17. * 3. Neither the name of "highlight.js", "highlight.php", nor the names of its
  18. * contributors may be used to endorse or promote products derived from this
  19. * software without specific prior written permission.
  20. *
  21. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31. * POSSIBILITY OF SUCH DAMAGE.
  32. */
  33. namespace Highlight;
  34. class Highlighter
  35. {
  36. private $modeBuffer = "";
  37. private $result = "";
  38. private $top = null;
  39. private $language = null;
  40. private $keywordCount = 0;
  41. private $relevance = 0;
  42. private $ignoreIllegals = false;
  43. private static $classMap = array();
  44. private static $languages = null;
  45. private static $aliases = null;
  46. private $tabReplace = null;
  47. private $classPrefix = "hljs-";
  48. private $autodetectSet = array(
  49. "xml", "json", "javascript", "css", "php", "http"
  50. );
  51. public function __construct()
  52. {
  53. $this->registerLanguages();
  54. }
  55. private function registerLanguages() {
  56. // Languages that take precedence in the classMap array.
  57. foreach(Array("xml", "django", "javascript", "matlab", "cpp") as $l) {
  58. $this->createLanguage($l);
  59. }
  60. $d = dir(__DIR__.DIRECTORY_SEPARATOR."languages");
  61. while (false !== ($entry = $d->read())) {
  62. if ($entry[0] !== ".") {
  63. $lng = substr($entry, 0, -5);
  64. $this->createLanguage($lng);
  65. }
  66. }
  67. $d->close();
  68. self::$languages = array_keys(self::$classMap);
  69. }
  70. private function createLanguage($languageId)
  71. {
  72. if (!isset(self::$classMap[$languageId])) {
  73. $lang = new Language($languageId);
  74. self::$classMap[$languageId] = $lang;
  75. if (isset($lang->mode->aliases)) {
  76. foreach ($lang->mode->aliases as $alias) {
  77. self::$aliases[$alias] = $languageId;
  78. }
  79. }
  80. }
  81. return self::$classMap[$languageId];
  82. }
  83. private function testRe($re, $lexeme)
  84. {
  85. if (!$re) {
  86. return false;
  87. }
  88. $test = preg_match($re, $lexeme, $match, PREG_OFFSET_CAPTURE);
  89. if ($test === false) {
  90. throw new \Exception("Invalid regexp: " .
  91. var_export($re, true));
  92. }
  93. return count($match) && ($match[0][1] == 0);
  94. }
  95. private function subMode($lexeme, $mode)
  96. {
  97. for ($i=0; $i<count($mode->contains); $i++) {
  98. if ($this->testRe($mode->contains[$i]->beginRe, $lexeme)) {
  99. return $mode->contains[$i];
  100. }
  101. }
  102. }
  103. private function endOfMode($mode, $lexeme)
  104. {
  105. if ($this->testRe($mode->endRe, $lexeme)) {
  106. while ($mode->endsParent && $mode->parent) {
  107. $mode = $mode->parent;
  108. }
  109. return $mode;
  110. }
  111. if ($mode->endsWithParent) {
  112. return $this->endOfMode($mode->parent, $lexeme);
  113. }
  114. }
  115. private function isIllegal($lexeme, $mode)
  116. {
  117. return
  118. !$this->ignoreIllegals && $this->testRe($mode->illegalRe, $lexeme);
  119. }
  120. private function keywordMatch($mode, $match)
  121. {
  122. $kwd = $this->language->caseInsensitive
  123. ? mb_strtolower($match[0], "UTF-8") : $match[0];
  124. return isset($mode->keywords[$kwd]) ? $mode->keywords[$kwd] : null;
  125. }
  126. private function buildSpan(
  127. $classname, $insideSpan, $leaveOpen=false, $noPrefix=false)
  128. {
  129. $classPrefix = $noPrefix ? "" : $this->classPrefix;
  130. $openSpan = "<span class=\"" . $classPrefix;
  131. $closeSpan = $leaveOpen ? "" : "</span>";
  132. $openSpan .= $classname . "\">";
  133. return $openSpan . $insideSpan . $closeSpan;
  134. }
  135. private function escape($value) {
  136. return htmlspecialchars($value, ENT_NOQUOTES);
  137. }
  138. private function processKeywords()
  139. {
  140. if (empty($this->top->keywords)) {
  141. return $this->escape($this->modeBuffer);
  142. }
  143. $result = "";
  144. $lastIndex = 0;
  145. /* TODO: when using the crystal language file on django and twigs code
  146. * the values of $this->top->lexemesRe can become "" (empty). Check
  147. * if this behaviour is consistent with highlight.js.
  148. */
  149. if ($this->top->lexemesRe) {
  150. while (preg_match($this->top->lexemesRe, $this->modeBuffer, $match,
  151. PREG_OFFSET_CAPTURE, $lastIndex)) {
  152. $result .= $this->escape(substr(
  153. $this->modeBuffer, $lastIndex, $match[0][1] - $lastIndex));
  154. $keyword_match = $this->keywordMatch($this->top, $match[0]);
  155. if ($keyword_match) {
  156. $this->relevance += $keyword_match[1];
  157. $result .= $this->buildSpan(
  158. $keyword_match[0], $this->escape($match[0][0]));
  159. } else {
  160. $result .= $this->escape($match[0][0]);
  161. }
  162. $lastIndex = strlen($match[0][0]) + $match[0][1];
  163. }
  164. }
  165. return $result . $this->escape(substr($this->modeBuffer, $lastIndex));
  166. }
  167. private function processSubLanguage()
  168. {
  169. try {
  170. $hl = new Highlighter();
  171. $hl->autodetectSet = $this->autodetectSet;
  172. $explicit = is_string($this->top->subLanguage);
  173. if ($explicit && !isset(
  174. array_flip(self::$languages)[$this->top->subLanguage])) {
  175. return $this->escape($this->modeBuffer);
  176. }
  177. if ($explicit) {
  178. $res = $hl->highlight($this->top->subLanguage,
  179. $this->modeBuffer, true,
  180. isset($this->continuations[$this->top->subLanguage])
  181. ? $this->continuations[$this->top->subLanguage] : null);
  182. } else {
  183. $res = $hl->highlightAuto($this->modeBuffer,
  184. count($this->top->subLanguage) ? $this->top->subLanguage
  185. : null);
  186. }
  187. // Counting embedded language score towards the host language may
  188. // be disabled with zeroing the containing mode relevance. Usecase
  189. // in point is Markdown that allows XML everywhere and makes every
  190. // XML snippet to have a much larger Markdown score.
  191. if ($this->top->relevance > 0) {
  192. $this->relevance += $res->relevance;
  193. }
  194. if ($explicit) {
  195. $this->continuations[$this->top->subLanguage] = $res->top;
  196. }
  197. return $this->buildSpan($res->language, $res->value, false, true);
  198. } catch (\Exception $e) {
  199. error_log("TODO, is this a relevant catch?");
  200. error_log($e);
  201. return $this->escape($this->modeBuffer);
  202. }
  203. }
  204. private function processBuffer()
  205. {
  206. return !is_null($this->top->subLanguage)
  207. ? $this->processSubLanguage() : $this->processKeywords();
  208. }
  209. private function startNewMode($mode, $lexeme)
  210. {
  211. $markup = $mode->className
  212. ? $this->buildSpan($mode->className, "", true) : "";
  213. if ($mode->returnBegin) {
  214. $this->result .= $markup;
  215. $this->modeBuffer = "";
  216. } elseif ($mode->excludeBegin) {
  217. $this->result .= $this->escape($lexeme) . $markup;
  218. $this->modeBuffer = "";
  219. } else {
  220. $this->result .= $markup;
  221. $this->modeBuffer = $lexeme;
  222. }
  223. $t = clone $mode;
  224. $t->parent = $this->top;
  225. $this->top = $t;
  226. }
  227. private function processLexeme($buffer, $lexeme=null)
  228. {
  229. $this->modeBuffer .= $buffer;
  230. if (null === $lexeme) {
  231. $this->result .= $this->processBuffer();
  232. return 0;
  233. }
  234. $new_mode = $this->subMode($lexeme, $this->top);
  235. if ($new_mode) {
  236. $this->result .= $this->processBuffer();
  237. $this->startNewMode($new_mode, $lexeme);
  238. return $new_mode->returnBegin ? 0 : strlen($lexeme);
  239. }
  240. $end_mode = $this->endOfMode($this->top, $lexeme);
  241. if ($end_mode) {
  242. $origin = $this->top;
  243. if (!($origin->returnEnd || $origin->excludeEnd)) {
  244. $this->modeBuffer .= $lexeme;
  245. }
  246. $this->result .= $this->processBuffer();
  247. do {
  248. if ($this->top->className) {
  249. $this->result .= "</span>";
  250. }
  251. $this->relevance += $this->top->relevance;
  252. $this->top = $this->top->parent;
  253. } while ($this->top != $end_mode->parent);
  254. if ($origin->excludeEnd) {
  255. $this->result .= $this->escape($lexeme);
  256. }
  257. $this->modeBuffer = "";
  258. if ($end_mode->starts) {
  259. $this->startNewMode($end_mode->starts, "");
  260. }
  261. return $origin->returnEnd ? 0 : strlen($lexeme);
  262. }
  263. if ($this->isIllegal($lexeme, $this->top)) {
  264. $className = $this->top->className
  265. ? $this->top->className : "unnamed";
  266. $err = "Illegal lexeme \"{$lexeme}\" for mode \"{$className}\"";
  267. throw new \Exception($err);
  268. }
  269. // Parser should not reach this point as all types of lexemes should
  270. // be caught earlier, but if it does due to some bug make sure it
  271. // advances at least one character forward to prevent infinite looping.
  272. $this->modeBuffer .= $lexeme;
  273. $l = strlen($lexeme);
  274. return $l ? $l : 1;
  275. }
  276. /**
  277. * Replace tabs for something more usable.
  278. */
  279. private function replaceTabs($code) {
  280. if ($this->tabReplace !== null) {
  281. return str_replace("\t", $this->tabReplace, $code);
  282. }
  283. return $code;
  284. }
  285. /**
  286. * Set the set of languages used for autodetection. When using
  287. * autodetection the code to highlight will be probed for every language
  288. * in this set. Limiting this set to only the languages you want to use
  289. * will greatly improve highlighting speed.
  290. *
  291. * @param array $set
  292. * An array of language games to use for autodetection. This defaults
  293. * to a typical set Web development languages.
  294. */
  295. public function setAutodetectLanguages(array $set)
  296. {
  297. $this->autodetectSet = array_unique($set);
  298. $this->registerLanguages();
  299. }
  300. /**
  301. * Get the tab replacement string.
  302. *
  303. * @return string
  304. * The tab replacement string.
  305. */
  306. public function getTabReplace()
  307. {
  308. return $this->tabReplace;
  309. }
  310. /**
  311. * Set the tab replacement string. This defaults to NULL: no tabs
  312. * will be replaced.
  313. *
  314. * @param string $tabReplace
  315. * The tab replacement string.
  316. */
  317. public function setTabReplace($tabReplace)
  318. {
  319. $this->tabReplace = $tabReplace;
  320. }
  321. /**
  322. * @throws
  323. * A DomainException if the requested language was not in this
  324. * Highlighter's language set.
  325. */
  326. private function getLanguage($name) {
  327. if (isset(self::$classMap[$name])) {
  328. return self::$classMap[$name];
  329. } elseif (isset(self::$aliases[$name]) &&
  330. isset(self::$classMap[self::$aliases[$name]])) {
  331. return self::$classMap[self::$aliases[$name]];
  332. }
  333. throw new \DomainException("Unknown language: $name");
  334. }
  335. /**
  336. * Core highlighting function. Accepts a language name, or an alias, and a
  337. * string with the code to highlight. Returns an object with the following
  338. * properties:
  339. * - relevance (int)
  340. * - value (an HTML string with highlighting markup)
  341. * @throws
  342. * A DomainException if the requested language was not in this
  343. * Highlighter's language set.
  344. */
  345. public function highlight(
  346. $language, $code, $ignoreIllegals=true, $continuation=null)
  347. {
  348. $this->language = $this->getLanguage($language);
  349. $this->language->compile();
  350. $this->top = $continuation ? $continuation : $this->language->mode;
  351. $this->continuations = array();
  352. $this->result = "";
  353. for ($current = $this->top; $current != $this->language->mode;
  354. $current = $current->parent) {
  355. if ($current->className) {
  356. $this->result =
  357. $this->buildSpan($current->className, '', true) .
  358. $this->result;
  359. }
  360. }
  361. $this->modeBuffer = "";
  362. $this->relevance = 0;
  363. $this->ignoreIllegals = $ignoreIllegals;
  364. $res = new \stdClass;
  365. $res->relevance = 0;
  366. $res->value = "";
  367. $res->language = "";
  368. try {
  369. $match = null;
  370. $count = 0;
  371. $index = 0;
  372. while ($this->top && $this->top->terminators) {
  373. $test = preg_match($this->top->terminators, $code, $match,
  374. PREG_OFFSET_CAPTURE, $index);
  375. if ($test === false) {
  376. throw new \Exception("Invalid regExp ".
  377. var_export($this->top->terminators, true));
  378. } else if ($test === 0) {
  379. break;
  380. }
  381. $count = $this->processLexeme(
  382. substr($code, $index, $match[0][1] - $index), $match[0][0]);
  383. $index = $match[0][1] + $count;
  384. }
  385. $this->processLexeme(substr($code, $index));
  386. for ($current = $this->top; $current != $this->language->mode;
  387. $current = $current->parent) {
  388. if ($current->className) {
  389. $this->result .= "</span>";
  390. }
  391. }
  392. $res->relevance = $this->relevance;
  393. $res->value = $this->replaceTabs($this->result);
  394. $res->language = $this->language->name;
  395. $res->top = $this->top;
  396. return $res;
  397. } catch (\Exception $e) {
  398. if (strpos($e->getMessage(), "Illegal") !== false) {
  399. $res->value = $this->escape($code);
  400. return $res;
  401. } else {
  402. throw $e;
  403. }
  404. }
  405. }
  406. public function highlightAuto($code, $languageSubset = null)
  407. {
  408. $res = new \stdClass;
  409. $res->relevance = 0;
  410. $res->value = $this->escape($code);
  411. $res->language = "";
  412. $scnd = clone $res;
  413. $tmp = $languageSubset ? $languageSubset : $this->autodetectSet;
  414. foreach ($tmp as $l) {
  415. $current = $this->highlight($l, $code, false);
  416. if ($current->relevance > $scnd->relevance) {
  417. $scnd = $current;
  418. }
  419. if ($current->relevance > $res->relevance) {
  420. $scnd = $res;
  421. $res = $current;
  422. }
  423. }
  424. if ($scnd->language) {
  425. $res->secondBest = $scnd;
  426. }
  427. return $res;
  428. }
  429. /**
  430. * Return a list of all supported languages. Using this list in
  431. * setAutodetectLanguages will turn on autodetection for all supported
  432. * languages.
  433. *
  434. * @return array
  435. * An array of language names (strings).
  436. */
  437. public function listLanguages()
  438. {
  439. return self::$languages;
  440. }
  441. }