XSTokenizer.class.php 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404
  1. <?php
  2. /**
  3. * XSTokenizer 接口和内置分词器文件
  4. *
  5. * @author hightman
  6. * @link http://www.xunsearch.com/
  7. * @copyright Copyright &copy; 2011 HangZhou YunSheng Network Technology Co., Ltd.
  8. * @license http://www.xunsearch.com/license/
  9. * @version $Id$
  10. */
  /**
   * Interface for custom field tokenizers (lexical analyzers).
   * The system builds the index of the corresponding field from the list of
   * terms returned by {@link getTokens}.
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS.tokenizer
   */
  interface XSTokenizer
  {
      /**
       * Built-in tokenizer definition (constant)
       */
      const DFL = 0;
      /**
       * Run the tokenizer and return the list of terms.
       * @param string $value the field value to tokenize (UTF-8 encoded)
       * @param XSDocument $doc the index document currently being processed (optional)
       * @return array array made of the terms cut from the value
       */
      public function getTokens($value, XSDocument $doc = null);
  }
  /**
   * Built-in "none" tokenizer: never produces any term.
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS.tokenizer
   */
  class XSTokenizerNone implements XSTokenizer
  {
      /**
       * Ignore the input and return an empty term list.
       * @param string $value the field value (unused)
       * @param XSDocument $doc the related index document (unused)
       * @return array always an empty array
       */
      public function getTokens($value, XSDocument $doc = null)
      {
          $tokens = array();

          return $tokens;
      }
  }
  /**
   * Built-in "full value" tokenizer: the whole field value is the only term.
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS.tokenizer
   */
  class XSTokenizerFull implements XSTokenizer
  {
      /**
       * Wrap the untouched value into a one-element term list.
       * @param string $value the field value
       * @param XSDocument $doc the related index document (unused)
       * @return array array whose single element is $value
       */
      public function getTokens($value, XSDocument $doc = null)
      {
          $tokens = array();
          $tokens[] = $value;

          return $tokens;
      }
  }
  /**
   * Built-in split tokenizer: cuts the value on a delimiter.
   *
   * The delimiter is either a literal string (a single space by default) or,
   * when the argument is longer than two bytes and both starts and ends with
   * a slash, a regular expression handed to preg_split().
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS.tokenizer
   */
  class XSTokenizerSplit implements XSTokenizer
  {
      /* @var string literal delimiter or /regex/ used to split the value */
      private $arg = ' ';

      /**
       * @param string $arg delimiter or /regex/; null or '' keeps the default single space
       */
      public function __construct($arg = null)
      {
          if ($arg !== null && $arg !== '') {
              $this->arg = $arg;
          }
      }

      /**
       * Split the value into terms.
       * @param string $value the field value to split
       * @param XSDocument $doc the related index document (unused)
       * @return array the pieces of $value; empty pieces are kept
       */
      public function getTokens($value, XSDocument $doc = null)
      {
          if ($this->isPattern()) {
              $tokens = preg_split($this->arg, $value);
              if ($tokens !== false) {
                  return $tokens;
              }
              // preg_split() reports failure (invalid pattern, PCRE runtime
              // error) by returning false, which is not a valid term list.
              // Treat the argument as a literal delimiter instead so that
              // the caller always receives an array.
          }
          return explode($this->arg, $value);
      }

      /**
       * Tell whether the configured argument looks like a /regex/.
       * @return bool true when it is longer than 2 bytes and slash-delimited
       */
      private function isPattern()
      {
          return strlen($this->arg) > 2
              && substr($this->arg, 0, 1) == '/'
              && substr($this->arg, -1, 1) == '/';
      }
  }
  /**
   * Built-in fixed-length tokenizer: cuts the value into consecutive chunks
   * of a fixed number of bytes (the last chunk may be shorter).
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS.tokenizer
   */
  class XSTokenizerXlen implements XSTokenizer
  {
      /* @var int chunk length in bytes */
      private $arg = 2;

      /**
       * @param mixed $arg chunk length (1~255); null or '' keeps the default of 2
       * @throws XSException when the integer value of $arg is outside 1~255
       */
      public function __construct($arg = null)
      {
          if ($arg === null || $arg === '') {
              return;
          }
          $this->arg = intval($arg);
          if (!($this->arg >= 1 && $this->arg <= 255)) {
              throw new XSException('Invalid argument for ' . __CLASS__ . ': ' . $arg);
          }
      }

      /**
       * Cut the value into chunks of the configured length.
       * @param string $value the field value; lengths are counted in bytes (strlen/substr)
       * @param XSDocument $doc the related index document (unused)
       * @return array the chunks in order; empty array for an empty value
       */
      public function getTokens($value, XSDocument $doc = null)
      {
          $chunks = array();
          $length = strlen($value);
          $offset = 0;
          while ($offset < $length) {
              $chunks[] = substr($value, $offset, $this->arg);
              $offset += $this->arg;
          }
          return $chunks;
      }
  }
  /**
   * Built-in step tokenizer: produces prefixes of the value that grow by a
   * fixed number of bytes, ending with the whole value.
   * E.g. with a step of 2, "abcde" gives "ab", "abcd", "abcde".
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS.tokenizer
   */
  class XSTokenizerXstep implements XSTokenizer
  {
      /* @var int step length in bytes */
      private $arg = 2;

      /**
       * @param mixed $arg step length (1~255); null or '' keeps the default of 2
       * @throws XSException when the integer value of $arg is outside 1~255
       */
      public function __construct($arg = null)
      {
          if ($arg !== null && $arg !== '') {
              $this->arg = intval($arg);
              if ($this->arg < 1 || $this->arg > 255) {
                  throw new XSException('Invalid argument for ' . __CLASS__ . ': ' . $arg);
              }
          }
      }

      /**
       * Build the list of growing prefixes of the value.
       * @param string $value the field value; lengths are counted in bytes (strlen/substr)
       * @param XSDocument $doc the related index document (unused)
       * @return array the prefixes from shortest to longest; empty array for an empty value
       */
      public function getTokens($value, XSDocument $doc = null)
      {
          $length = strlen($value);
          if ($length === 0) {
              // Nothing to index: without this guard substr() on the empty
              // value would add one bogus term ('' on PHP 8, false before).
              return array();
          }
          $terms = array();
          $end = 0;
          do {
              $end += $this->arg;
              // substr() clamps to the end of the value, so the last
              // iteration yields the complete value.
              $terms[] = substr($value, 0, $end);
          } while ($end < $length);
          return $terms;
      }
  }
  /**
   * SCWS tokenizer (communicates with the search server)
   *
   * Options set through the set*()/addDict() methods are only recorded on the
   * object; they are sent to the server right before every query command
   * (see applySetting()).
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS.tokenizer
   * @since 1.3.1
   */
  class XSTokenizerScws implements XSTokenizer
  {
      const MULTI_MASK = 15;
      /* @var string charset, defaults to the project charset; static, so shared by all instances */
      private static $_charset;
      /* @var array recorded option settings (XSCommand objects, or arrays of them) */
      private $_setting = array();
      /* @var XSServer server object used for tokenizing; static, so shared by all instances */
      private static $_server;

      /**
       * Constructor
       * Initializes, on first use, the search server connection used for tokenizing
       * @param string $arg multi (compound) level argument, not specified by default
       * @throws XSException when no XS instance has been created yet
       */
      public function __construct($arg = null)
      {
          if (self::$_server === null) {
              $xs = XS::getLastXS();
              if ($xs === null) {
                  throw new XSException('An XS instance should be created before using ' . __CLASS__);
              }
              self::$_server = $xs->getScwsServer();
              // NOTE(review): 0 presumably means "no timeout" - confirm against XSServer
              self::$_server->setTimeout(0);
              self::$_charset = $xs->getDefaultCharset();
              // constants (defined here only when nothing else defined them already)
              if (!defined('SCWS_MULTI_NONE')) {
                  define('SCWS_MULTI_NONE', 0);
                  define('SCWS_MULTI_SHORT', 1);
                  define('SCWS_MULTI_DUALITY', 2);
                  define('SCWS_MULTI_ZMAIN', 4);
                  define('SCWS_MULTI_ZALL', 8);
              }
              if (!defined('SCWS_XDICT_XDB')) {
                  define('SCWS_XDICT_XDB', 1);
                  define('SCWS_XDICT_MEM', 2);
                  define('SCWS_XDICT_TXT', 4);
              }
          }
          if ($arg !== null && $arg !== '') {
              $this->setMulti($arg);
          }
      }

      /**
       * XSTokenizer interface
       * Note: this records setIgnore(true) on the object, which stays in
       * effect for later calls on the same instance.
       * @param string $value the field value to tokenize (UTF-8 encoded)
       * @param XSDocument $doc the related index document (unused)
       * @return array list of the words found in $value
       */
      public function getTokens($value, XSDocument $doc = null)
      {
          $tokens = array();
          $this->setIgnore(true);
          // save charset, force to use UTF-8
          $_charset = self::$_charset;
          self::$_charset = 'UTF-8';
          try {
              $words = $this->getResult($value);
          } catch (Exception $e) {
              // The charset is static: restore it on failure as well,
              // otherwise every instance would stay forced to UTF-8.
              self::$_charset = $_charset;
              throw $e;
          }
          // restore charset
          self::$_charset = $_charset;
          foreach ($words as $word) {
              $tokens[] = $word['word'];
          }
          return $tokens;
      }

      /**
       * Set the charset
       * This is the charset of the $text argument of the {@link getResult}
       * family of functions (and of the words they return). Until this is
       * called, the default charset of the XS project is used.
       * @param string $charset charset name, case-insensitive; "UTF8" is normalized to "UTF-8"
       * @return XSTokenizerScws the object itself, to allow chained calls
       */
      public function setCharset($charset)
      {
          self::$_charset = strtoupper($charset);
          if (self::$_charset == 'UTF8') {
              self::$_charset = 'UTF-8';
          }
          return $this;
      }

      /**
       * Set whether punctuation is ignored
       * @param bool $yes whether to ignore; anything but a strict false enables it
       * @return XSTokenizerScws the object itself, to allow chained calls
       */
      public function setIgnore($yes = true)
      {
          $flag = $yes === false ? 0 : 1;
          $this->_setting['ignore'] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_SET_IGNORE, $flag);
          return $this;
      }

      /**
       * Set the multi (compound) segmentation option
       * @param int $mode compound option, value range 0~15 (other bits are masked off)
       * The default value is 3, constants may be combined:
       * SCWS_MULTI_SHORT|SCWS_MULTI_DUALITY|SCWS_MULTI_ZMAIN|SCWS_MULTI_ZALL
       * @return XSTokenizerScws the object itself, to allow chained calls
       */
      public function setMulti($mode = 3)
      {
          $mode = intval($mode) & self::MULTI_MASK;
          $this->_setting['multi'] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_SET_MULTI, $mode);
          return $this;
      }

      /**
       * Set the dictionary, TXT/XDB formats are supported
       * Any dictionary previously recorded with {@link addDict} is dropped.
       * @param string $fpath dictionary path on the server side
       * @param int $mode dictionary type, constants: SCWS_XDICT_XDB|SCWS_XDICT_TXT|SCWS_XDICT_MEM;
       * when not an integer, it is guessed from $fpath
       * @return XSTokenizerScws the object itself, to allow chained calls
       */
      public function setDict($fpath, $mode = null)
      {
          $mode = $this->resolveDictMode($fpath, $mode);
          $this->_setting['set_dict'] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_SET_DICT, $mode, $fpath);
          unset($this->_setting['add_dict']);
          return $this;
      }

      /**
       * Add a dictionary, TXT/XDB formats are supported
       * @param string $fpath dictionary path on the server side
       * @param int $mode dictionary type, constants: SCWS_XDICT_XDB|SCWS_XDICT_TXT|SCWS_XDICT_MEM;
       * when not an integer, it is guessed from $fpath
       * @return XSTokenizerScws the object itself, to allow chained calls
       */
      public function addDict($fpath, $mode = null)
      {
          $mode = $this->resolveDictMode($fpath, $mode);
          if (!isset($this->_setting['add_dict'])) {
              $this->_setting['add_dict'] = array();
          }
          $this->_setting['add_dict'][] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_ADD_DICT, $mode, $fpath);
          return $this;
      }

      /**
       * Set the duality option for loose single characters
       * @param bool $yes whether to automatically combine loose characters in pairs;
       * anything but a strict false enables it
       * @return XSTokenizerScws the object itself, to allow chained calls
       */
      public function setDuality($yes = true)
      {
          $flag = $yes === false ? 0 : 1;
          $this->_setting['duality'] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_SET_DUALITY, $flag);
          return $this;
      }

      /**
       * Get the scws version
       * @return string version number
       */
      public function getVersion()
      {
          $cmd = new XSCommand(XS_CMD_SEARCH_SCWS_GET, XS_CMD_SCWS_GET_VERSION);
          $res = self::$_server->execCommand($cmd, XS_CMD_OK_INFO);
          return $res->buf;
      }

      /**
       * Get the segmentation result
       * @param string $text the text to tokenize
       * @return array array of words, each one being an array of
       * [off: position of the word in the text, attr: part of speech, word: the word]
       */
      public function getResult($text)
      {
          $words = array();
          $text = $this->applySetting($text);
          $cmd = new XSCommand(XS_CMD_SEARCH_SCWS_GET, XS_CMD_SCWS_GET_RESULT, 0, $text);
          $res = self::$_server->execCommand($cmd, XS_CMD_OK_SCWS_RESULT);
          // One response per word; a response with an empty buffer ends the list
          while ($res->buf !== '') {
              // unsigned int offset, 4-byte attribute, the remaining bytes are the word
              $tmp = unpack('Ioff/a4attr/a*word', $res->buf);
              $tmp['word'] = XS::convert($tmp['word'], self::$_charset, 'UTF-8');
              $words[] = $tmp;
              $res = self::$_server->getRespond();
          }
          return $words;
      }

      /**
       * Get the statistics of the most important words
       * @param string $text the text to tokenize
       * @param int $limit sent to the server with the command, defaults to 10
       * (presumably the maximum number of words returned - confirm against the server)
       * @param string $xattr part-of-speech filter for the returned words, several
       * values separated by commas, a leading ~ negates the filter.
       * E.g.: n,v returns nouns and verbs only; ~n,v returns every word except nouns and verbs
       * @return array array of words, each one being an array of
       * [times: number of occurrences, attr: part of speech, word: the word]
       */
      public function getTops($text, $limit = 10, $xattr = '')
      {
          $words = array();
          $text = $this->applySetting($text);
          $cmd = new XSCommand(XS_CMD_SEARCH_SCWS_GET, XS_CMD_SCWS_GET_TOPS, $limit, $text, $xattr);
          $res = self::$_server->execCommand($cmd, XS_CMD_OK_SCWS_TOPS);
          // One response per word; a response with an empty buffer ends the list
          while ($res->buf !== '') {
              // unsigned int count, 4-byte attribute, the remaining bytes are the word
              $tmp = unpack('Itimes/a4attr/a*word', $res->buf);
              $tmp['word'] = XS::convert($tmp['word'], self::$_charset, 'UTF-8');
              $words[] = $tmp;
              $res = self::$_server->getRespond();
          }
          return $words;
      }

      /**
       * Tell whether the text contains words of the given part of speech
       * @param string $text the text to check
       * @param string $xattr the part of speech to look for, see {@link getTops}
       * @return bool whether the text contains such words (server answered 'OK')
       */
      public function hasWord($text, $xattr)
      {
          $text = $this->applySetting($text);
          $cmd = new XSCommand(XS_CMD_SEARCH_SCWS_GET, XS_CMD_SCWS_HAS_WORD, 0, $text, $xattr);
          $res = self::$_server->execCommand($cmd, XS_CMD_OK_INFO);
          return $res->buf === 'OK';
      }

      /**
       * Pick the dictionary type to send to the server.
       * @param string $fpath dictionary path on the server side
       * @param mixed $mode explicit type; used as is when it is an integer
       * @return int $mode itself, else SCWS_XDICT_TXT when $fpath contains
       * '.txt' (case-insensitive, anywhere in the path), else SCWS_XDICT_XDB
       */
      private function resolveDictMode($fpath, $mode)
      {
          if (is_int($mode)) {
              return $mode;
          }
          return stripos($fpath, '.txt') !== false ? SCWS_XDICT_TXT : SCWS_XDICT_XDB;
      }

      /**
       * Reopen the server connection, send every recorded setting to it and
       * convert the text for the server.
       * @param string $text text in the current charset
       * @return string the text converted through XS::convert($text, 'UTF-8', <current charset>)
       */
      private function applySetting($text)
      {
          self::$_server->reopen();
          foreach ($this->_setting as $cmd) {
              if (is_array($cmd)) {
                  // list of commands (dictionaries recorded by addDict)
                  foreach ($cmd as $_cmd) {
                      self::$_server->execCommand($_cmd);
                  }
              } else {
                  self::$_server->execCommand($cmd);
              }
          }
          return XS::convert($text, 'UTF-8', self::$_charset);
      }
  }