<?php
/**
 * XSIndex class definition file (XSIndex.class.php)
 *
 * @author hightman
 * @link http://www.xunsearch.com/
 * @copyright Copyright &copy; 2011 HangZhou YunSheng Network Technology Co., Ltd.
 * @license http://www.xunsearch.com/license/
 * @version $Id$
 */
  /**
   * XS index management.
   *
   * Adds, updates and deletes index data. Every command is issued through
   * {@link execCommand}, which also replays it on the extra servers registered
   * with {@link addServer}.
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS
   */
  class XSIndex extends XSServer
  {
      /**
       * Terms longer than this many bytes are skipped instead of being indexed.
       * NOTE(review): presumably mirrors a term-length limit of the backend - confirm.
       */
      const MAX_TERM_LENGTH = 200;

      /** @var string serialized commands waiting in the submit buffer */
      private $_buf = '';

      /** @var int buffer capacity in bytes; buffering is active only while this is > 0 */
      private $_bufSize = 0;

      /** @var bool true between a successful beginRebuild() and the matching end/stop */
      private $_rebuild = false;

      /**
       * @var XSServer[] extra index servers that receive a copy of every command.
       * Kept per instance (not static) so that one index object neither replays
       * its commands on servers registered by another object nor closes them
       * when it is destroyed.
       */
      private $_adds = array();

      /**
       * Registers an additional index server that is kept in sync with this one.
       *
       * @param string $conn connection parameter of the index server
       * @return XSServer the server object created for $conn
       * @throws XSException on failure
       */
      public function addServer($conn)
      {
          $srv = new XSServer($conn, $this->xs);
          $this->_adds[] = $srv;
          return $srv;
      }

      /**
       * Executes a server command and returns the response of the primary server.
       *
       * Overridden so that the same command is also executed on every server
       * added through {@link addServer}; their responses are discarded.
       */
      public function execCommand($cmd, $res_arg = XS_CMD_NONE, $res_cmd = XS_CMD_OK)
      {
          $res = parent::execCommand($cmd, $res_arg, $res_cmd);
          foreach ($this->_adds as $srv) {
              $srv->execCommand($cmd, $res_arg, $res_cmd);
          }
          return $res;
      }

      /**
       * Completely clears the index data.
       * Clearing is refused while the current database is being rebuilt.
       *
       * @return XSIndex this object, to allow call chaining
       * @see beginRebuild
       */
      public function clean()
      {
          $this->execCommand(XS_CMD_INDEX_CLEAN_DB, XS_CMD_OK_DB_CLEAN);
          return $this;
      }

      /**
       * Adds a document to the index.
       * Note: primary key conflicts are NOT detected; the document is added even
       * if another document with the same primary key already exists.
       *
       * @param XSDocument $doc
       * @return XSIndex this object, to allow call chaining
       * @see update
       */
      public function add(XSDocument $doc)
      {
          return $this->update($doc, true);
      }

      /**
       * Updates a document in the index.
       *
       * Equivalent to deleting the existing document with the same primary key
       * and then adding this one. When the document is known to be new,
       * {@link add} is the recommended call.
       *
       * @param XSDocument $doc
       * @param bool $add whether this is a new document, i.e. no existing data
       *        shares its primary key; only a strict true selects "add" mode
       * @return XSIndex this object, to allow call chaining
       * @throws XSException when the document has no primary key value
       */
      public function update(XSDocument $doc, $add = false)
      {
          // before submit: the document may veto the submission
          if ($doc->beforeSubmit($this) === false) {
              return $this;
          }

          // check primary key of document
          $fid = $this->xs->getFieldId();
          $key = $doc->f($fid);
          if ($key === null || $key === '') {
              throw new XSException('Missing value of primary key (FIELD:' . $fid . ')');
          }

          // request cmd
          $cmd = new XSCommand(XS_CMD_INDEX_REQUEST, XS_CMD_INDEX_REQUEST_ADD);
          if ($add !== true) {
              $cmd->arg1 = XS_CMD_INDEX_REQUEST_UPDATE;
              $cmd->arg2 = $fid->vno;
              $cmd->buf = $key;
          }
          $cmds = array($cmd);

          // document cmds
          foreach ($this->xs->getAllFields() as $field) /* @var $field XSFieldMeta */ {
              // value
              if (($value = $doc->f($field)) !== null) {
                  $varg = $field->isNumeric() ? XS_CMD_VALUE_FLAG_NUMERIC : 0;
                  $value = $field->val($value);
                  if (!$field->hasCustomTokenizer()) {
                      // internal tokenizer
                      $wdf = $field->weight | ($field->withPos() ? XS_CMD_INDEX_FLAG_WITHPOS : 0);
                      if ($field->hasIndexMixed()) {
                          $cmds[] = new XSCommand(XS_CMD_DOC_INDEX, $wdf, XSFieldScheme::MIXED_VNO, $value);
                      }
                      if ($field->hasIndexSelf()) {
                          $wdf |= $field->isNumeric() ? 0 : XS_CMD_INDEX_FLAG_SAVEVALUE;
                          $cmds[] = new XSCommand(XS_CMD_DOC_INDEX, $wdf, $field->vno, $value);
                      }
                      // add value (skipped when the self index command above already carries SAVEVALUE)
                      if (!$field->hasIndexSelf() || $field->isNumeric()) {
                          $cmds[] = new XSCommand(XS_CMD_DOC_VALUE, $varg, $field->vno, $value);
                      }
                  } else {
                      // add index
                      if ($field->hasIndex()) {
                          $terms = $field->getCustomTokenizer()->getTokens($value, $doc);
                          // self: [bool term, NOT weight, NOT stem, NOT pos]
                          if ($field->hasIndexSelf()) {
                              $wdf = $field->isBoolIndex() ? 1 : ($field->weight | XS_CMD_INDEX_FLAG_CHECKSTEM);
                              $this->appendTermCommands($cmds, $terms, $wdf, $field->vno);
                          }
                          // mixed: [use default tokenizer]
                          if ($field->hasIndexMixed()) {
                              $mtext = implode(' ', $terms);
                              $cmds[] = new XSCommand(XS_CMD_DOC_INDEX, $field->weight, XSFieldScheme::MIXED_VNO, $mtext);
                          }
                      }
                      // add value
                      $cmds[] = new XSCommand(XS_CMD_DOC_VALUE, $varg, $field->vno, $value);
                  }
              }

              // process add terms
              if (($terms = $doc->getAddTerms($field)) !== null) {
                  // ignore weight for bool index
                  $wdf1 = $field->isBoolIndex() ? 0 : XS_CMD_INDEX_FLAG_CHECKSTEM;
                  foreach ($terms as $term => $wdf) {
                      $term = strtolower($term);
                      if (strlen($term) > self::MAX_TERM_LENGTH) {
                          continue;
                      }
                      $wdf2 = $field->isBoolIndex() ? 1 : $wdf * $field->weight;
                      // a single command carries at most MAX_WDF, so larger values are
                      // split over several commands for the same term
                      while ($wdf2 > XSFieldMeta::MAX_WDF) {
                          $cmds[] = new XSCommand(XS_CMD_DOC_TERM, $wdf1 | XSFieldMeta::MAX_WDF, $field->vno, $term);
                          $wdf2 -= XSFieldMeta::MAX_WDF;
                      }
                      $cmds[] = new XSCommand(XS_CMD_DOC_TERM, $wdf1 | $wdf2, $field->vno, $term);
                  }
              }

              // process add text
              if (($text = $doc->getAddIndex($field)) !== null) {
                  if (!$field->hasCustomTokenizer()) {
                      $wdf = $field->weight | ($field->withPos() ? XS_CMD_INDEX_FLAG_WITHPOS : 0);
                      $cmds[] = new XSCommand(XS_CMD_DOC_INDEX, $wdf, $field->vno, $text);
                  } else {
                      // NOT pos
                      $wdf = $field->isBoolIndex() ? 1 : ($field->weight | XS_CMD_INDEX_FLAG_CHECKSTEM);
                      $terms = $field->getCustomTokenizer()->getTokens($text, $doc);
                      $this->appendTermCommands($cmds, $terms, $wdf, $field->vno);
                  }
              }
          }

          // submit cmd
          $cmds[] = new XSCommand(XS_CMD_INDEX_SUBMIT);

          // execute cmd
          if ($this->_bufSize > 0) {
              $this->appendBuffer(implode('', $cmds));
          } else {
              // only the final submit command is answered with "request finished"
              $submit = array_pop($cmds);
              foreach ($cmds as $one) {
                  $this->execCommand($one);
              }
              $this->execCommand($submit, XS_CMD_OK_RQST_FINISHED);
          }

          // after submit
          $doc->afterSubmit($this);
          return $this;
      }

      /**
       * Deletes data from the index.
       * <pre>
       * $index->del('123'); // delete the record whose primary key is 123
       * $index->del(array('123', '789', '456')); // delete the records with primary key 123, 789, 456
       * $index->del('abc', 'field'); // delete all records having index term abc on field "field"
       * $index->del(array('abc', 'def'), 'field'); // delete all records having index term abc or def on field "field"
       * </pre>
       * An empty array deletes nothing and sends nothing to the server.
       *
       * @param mixed $term a single primary key or index term of the given field,
       *        or an array of them, encoded in the default charset of {@link xs}
       * @param string $field name of the field the terms belong to; when omitted
       *        the primary key field (type ID) is used
       * @return XSIndex this object, to allow call chaining
       */
      public function del($term, $field = null)
      {
          // get field
          $field = $field === null ? $this->xs->getFieldId() : $this->xs->getField($field);

          // get commands
          $terms = is_array($term) ? array_unique($term) : array($term);
          $terms = XS::convert($terms, 'UTF-8', $this->xs->getDefaultCharset());
          $cmds = array();
          foreach ($terms as $one) {
              $cmds[] = new XSCommand(XS_CMD_INDEX_REMOVE, 0, $field->vno, strtolower($one));
          }

          // nothing to delete: do not send an empty exdata packet
          if (count($cmds) === 0) {
              return $this;
          }

          // combine multi commands into exdata
          if ($this->_bufSize > 0) {
              $this->appendBuffer(implode('', $cmds));
          } elseif (count($cmds) === 1) {
              $this->execCommand($cmds[0], XS_CMD_OK_RQST_FINISHED);
          } else {
              $cmd = array('cmd' => XS_CMD_INDEX_EXDATA, 'buf' => implode('', $cmds));
              $this->execCommand($cmd, XS_CMD_OK_RQST_FINISHED);
          }
          return $this;
      }

      /**
       * Submits a batch of serialized index commands.
       *
       * Several command packets are stored back to back in a file or a variable
       * and submitted at once to reduce network overhead.
       *
       * @param string $data the command packet data, or the path of a file that
       *        holds it; the encoding must already be UTF-8
       * @param bool $check_file whether $data may be a file path; the file check
       *        is only attempted for values shorter than 255 bytes
       * @return XSIndex this object, to allow call chaining
       * @throws XSException when the file cannot be read or the data does not
       *         start with an allowed command
       */
      public function addExdata($data, $check_file = true)
      {
          if (strlen($data) < 255 && $check_file
              && file_exists($data) && ($data = file_get_contents($data)) === false) {
              throw new XSException('Failed to read exdata from file');
          }

          // try to check allowed (BUG: check the first cmd only):
          // XS_CMD_IMPORT_HEADER, XS_CMD_INDEX_REQUEST, XS_CMD_INDEX_SYNONYMS,
          // XS_CMD_INDEX_REMOVE, XS_CMD_INDEX_EXDATA
          $first = ord(substr($data, 0, 1));
          if ($first != XS_CMD_IMPORT_HEADER
              && $first != XS_CMD_INDEX_REQUEST && $first != XS_CMD_INDEX_SYNONYMS
              && $first != XS_CMD_INDEX_REMOVE && $first != XS_CMD_INDEX_EXDATA) {
              throw new XSException('Invalid start command of exdata (CMD:' . $first . ')');
          }

          // create cmd & execute it
          $cmd = array('cmd' => XS_CMD_INDEX_EXDATA, 'buf' => $data);
          $this->execCommand($cmd, XS_CMD_OK_RQST_FINISHED);
          return $this;
      }

      /**
       * Adds a synonym.
       * Nothing is sent when either argument is an empty string.
       *
       * @param string $raw the original word; English words may be several
       *        space-separated words and are forced to lower case
       * @param string $synonym the synonym term, a minimal morpheme without
       *        spaces or other separators
       * @return XSIndex this object, to allow call chaining
       * @throws XSException on failure
       * @since 1.3.0
       */
      public function addSynonym($raw, $synonym)
      {
          $raw = strval($raw);
          $synonym = strval($synonym);
          if ($raw !== '' && $synonym !== '') {
              $this->submitOrBuffer(
                  new XSCommand(XS_CMD_INDEX_SYNONYMS, XS_CMD_INDEX_SYNONYMS_ADD, 0, $raw, $synonym)
              );
          }
          return $this;
      }

      /**
       * Deletes a synonym.
       * Nothing is sent when $raw is an empty string.
       *
       * @param string $raw the original word; English words may be several
       *        space-separated words and are forced to lower case
       * @param string $synonym the synonym term to delete; the default null
       *        deletes all synonyms of the original word
       * @return XSIndex this object, to allow call chaining
       * @throws XSException on failure
       * @since 1.3.0
       */
      public function delSynonym($raw, $synonym = null)
      {
          $raw = strval($raw);
          $synonym = $synonym === null ? '' : strval($synonym);
          if ($raw !== '') {
              $this->submitOrBuffer(
                  new XSCommand(XS_CMD_INDEX_SYNONYMS, XS_CMD_INDEX_SYNONYMS_DEL, 0, $raw, $synonym)
              );
          }
          return $this;
      }

      /**
       * Sets the scws "multi" (compound word) level of the current index database.
       *
       * The level is an important scws parameter controlling segmentation
       * granularity, i.e. how long words are split further; default 3, range 0~15.
       * Note: the setting applies to the current index database only, a later
       * call overrides an earlier one, and it only affects index data submitted
       * afterwards; rebuild the index to apply it to existing data.
       * A level outside 0~15 is silently ignored and no command is sent.
       *
       * @param int $level the compound level to set
       * @return XSIndex this object, to allow call chaining
       * @since 1.4.7
       * @throws XSException on failure
       */
      public function setScwsMulti($level)
      {
          $level = intval($level);
          if ($level >= 0 && $level < 16) {
              $cmd = array('cmd' => XS_CMD_SEARCH_SCWS_SET, 'arg1' => XS_CMD_SCWS_SET_MULTI, 'arg2' => $level);
              $this->execCommand($cmd);
          }
          return $this;
      }

      /**
       * Returns the scws "multi" (compound word) level of the current index database.
       *
       * @return int the compound level of the current database
       * @see setScwsMulti
       * @since 1.4.7
       */
      public function getScwsMulti()
      {
          $cmd = array('cmd' => XS_CMD_SEARCH_SCWS_GET, 'arg1' => XS_CMD_SCWS_GET_MULTI);
          $res = $this->execCommand($cmd, XS_CMD_OK_INFO);
          return intval($res->buf);
      }

      /**
       * Opens the index command submit buffer.
       *
       * To save network round trips, add/update/del style commands are collected
       * locally and only sent to the server once their total size reaches $size
       * or {@link closeBuffer} is called. Typically used for bulk index updates;
       * calling this repeatedly is unnecessary. Commands still pending from a
       * previous buffer are submitted before the new size takes effect.
       *
       * @param int $size buffer size in MB, 4MB by default
       * @return XSIndex this object, to allow call chaining
       */
      public function openBuffer($size = 4)
      {
          if ($this->_buf !== '') {
              $this->addExdata($this->_buf, false);
          }
          $this->_bufSize = intval($size) << 20; // MB -> bytes
          $this->_buf = '';
          return $this;
      }

      /**
       * Submits all pending commands and closes the buffer.
       * Calling this without an open buffer has no effect.
       *
       * @return XSIndex this object, to allow call chaining
       * @see openBuffer
       */
      public function closeBuffer()
      {
          // a size of 0 flushes pending data and turns buffering off
          return $this->openBuffer(0);
      }

      /**
       * Starts rebuilding the index.
       *
       * From now on all index update commands are written to a temporary
       * database instead of the current search database; call {@link endRebuild}
       * when done to switch over smoothly. The old index stays searchable during
       * the rebuild, whereas clearing the data with {@link clean} would expose
       * incomplete data to searches while rebuilding.
       *
       * @return XSIndex this object, to allow call chaining
       * @see endRebuild
       */
      public function beginRebuild()
      {
          $this->execCommand(array('cmd' => XS_CMD_INDEX_REBUILD, 'arg1' => 0), XS_CMD_OK_DB_REBUILD);
          $this->_rebuild = true;
          return $this;
      }

      /**
       * Finishes and closes the index rebuild.
       * Call it after rebuilding to replace the old index data with the rebuilt
       * data. Does nothing unless this object started a rebuild.
       *
       * @return XSIndex this object, to allow call chaining
       * @see beginRebuild
       */
      public function endRebuild()
      {
          if ($this->_rebuild === true) {
              // cleared first so that a failing command is not retried by the destructor
              $this->_rebuild = false;
              $this->execCommand(array('cmd' => XS_CMD_INDEX_REBUILD, 'arg1' => 1), XS_CMD_OK_DB_REBUILD);
          }
          return $this;
      }

      /**
       * Aborts the index rebuild.
       * Discards all data of the temporary rebuild database and goes back to the
       * current search database; mainly for rebuilds that stopped unexpectedly.
       *
       * @return XSIndex this object, to allow call chaining
       * @throws XSException on any server error other than XS_CMD_ERR_WRONGPLACE
       * @see beginRebuild
       * @since 1.3.4
       */
      public function stopRebuild()
      {
          try {
              $this->execCommand(array('cmd' => XS_CMD_INDEX_REBUILD, 'arg1' => 2), XS_CMD_OK_DB_REBUILD);
          } catch (XSException $e) {
              // NOTE(review): WRONGPLACE presumably means no rebuild is in progress - confirm
              if ($e->getCode() !== XS_CMD_ERR_WRONGPLACE) {
                  throw $e;
              }
          }
          // The caller asked to abort, so the destructor must not try to end the
          // rebuild later - also when the server had nothing to stop.
          $this->_rebuild = false;
          return $this;
      }

      /**
       * Changes the directory that stores the index data.
       * Index data is saved in the "db" directory on the server by default; this
       * method changes that data directory name.
       *
       * @param string $name database name
       * @return XSIndex this object, to allow call chaining
       */
      public function setDb($name)
      {
          $this->execCommand(array('cmd' => XS_CMD_INDEX_SET_DB, 'buf' => $name), XS_CMD_OK_DB_CHANGED);
          return $this;
      }

      /**
       * Forces the server to flush the search log of the current project.
       *
       * @return bool true on success, false when the server reports XS_CMD_ERR_BUSY
       * @throws XSException on any other error
       */
      public function flushLogging()
      {
          try {
              $this->execCommand(XS_CMD_FLUSH_LOGGING, XS_CMD_OK_LOG_FLUSHED);
          } catch (XSException $e) {
              if ($e->getCode() === XS_CMD_ERR_BUSY) {
                  return false;
              }
              throw $e;
          }
          return true;
      }

      /**
       * Forces the server to flush the index cache of the current database.
       *
       * @return bool true on success, false when the server reports
       *         XS_CMD_ERR_BUSY or XS_CMD_ERR_RUNNING
       * @throws XSException on any other error
       */
      public function flushIndex()
      {
          try {
              $this->execCommand(XS_CMD_INDEX_COMMIT, XS_CMD_OK_DB_COMMITED);
          } catch (XSException $e) {
              if ($e->getCode() === XS_CMD_ERR_BUSY || $e->getCode() === XS_CMD_ERR_RUNNING) {
                  return false;
              }
              throw $e;
          }
          return true;
      }

      /**
       * Returns the content of the custom dictionary.
       *
       * @return string custom dictionary content
       * @throws XSException on failure
       */
      public function getCustomDict()
      {
          $res = $this->execCommand(XS_CMD_INDEX_USER_DICT, XS_CMD_OK_INFO);
          return $res->buf;
      }

      /**
       * Sets the content of the custom dictionary.
       *
       * @param string $content the new dictionary content
       * @throws XSException on failure
       */
      public function setCustomDict($content)
      {
          $cmd = array('cmd' => XS_CMD_INDEX_USER_DICT, 'arg1' => 1, 'buf' => $content);
          $this->execCommand($cmd, XS_CMD_OK_DICT_SAVED);
      }

      /**
       * Closes the connection to the index server.
       * Pending buffered commands are submitted first.
       */
      public function close($ioerr = false)
      {
          $this->closeBuffer();
          parent::close($ioerr);
      }

      /**
       * Appends serialized command data to the buffer.
       * When the buffered data reaches the buffer size, it is submitted to the
       * server and the buffer is emptied.
       *
       * @param string $buf serialized command packet data
       */
      private function appendBuffer($buf)
      {
          $this->_buf .= $buf;
          if (strlen($this->_buf) >= $this->_bufSize) {
              $this->addExdata($this->_buf, false);
              $this->_buf = '';
          }
      }

      /**
       * Appends one XS_CMD_DOC_TERM command per term to a command list.
       * Terms longer than MAX_TERM_LENGTH bytes are skipped; the others are
       * lower-cased before being added.
       *
       * @param array $cmds command list to append to (modified in place)
       * @param mixed $terms iterable list of term strings
       * @param int $wdf value passed as arg1 of every command
       * @param int $vno value passed as arg2 of every command (field number)
       */
      private function appendTermCommands(array &$cmds, $terms, $wdf, $vno)
      {
          foreach ($terms as $term) {
              if (strlen($term) > self::MAX_TERM_LENGTH) {
                  continue;
              }
              $cmds[] = new XSCommand(XS_CMD_DOC_TERM, $wdf, $vno, strtolower($term));
          }
      }

      /**
       * Buffers the command while a buffer is open, otherwise executes it at
       * once and expects the "request finished" response.
       *
       * @param XSCommand $cmd the command to submit
       */
      private function submitOrBuffer(XSCommand $cmd)
      {
          if ($this->_bufSize > 0) {
              $this->appendBuffer(strval($cmd));
          } else {
              $this->execCommand($cmd, XS_CMD_OK_RQST_FINISHED);
          }
      }

      /**
       * Destructor.
       * Ends a rebuild that is still open and closes the extra servers added to
       * this object.
       */
      public function __destruct()
      {
          if ($this->_rebuild === true) {
              try {
                  $this->endRebuild();
              } catch (Exception $e) {
                  // deliberately ignored: ending the rebuild here is best effort
              }
          }
          foreach ($this->_adds as $srv) {
              $srv->close();
          }
          $this->_adds = array();
          parent::__destruct();
      }
  }