123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404 |
- <?php
- /**
- * XSTokenizer 接口和内置分词器文件
- *
- * @author hightman
- * @link http://www.xunsearch.com/
- * @copyright Copyright © 2011 HangZhou YunSheng Network Technology Co., Ltd.
- * @license http://www.xunsearch.com/license/
- * @version $Id$
- */
- /**
- * 自定义字段词法分析器接口
- * 系统将按照 {@link getTokens} 返回的词汇列表对相应的字段建立索引
- *
- * @author hightman <hightman@twomice.net>
- * @version 1.0.0
- * @package XS.tokenizer
- */
- interface XSTokenizer
- {
- /**
- * 内置分词器定义(常量)
- */
- const DFL = 0;
- /**
- * 执行分词并返回词列表
- * @param string $value 待分词的字段值(UTF-8编码)
- * @param XSDocument $doc 当前相关的索引文档
- * @return array 切好的词组成的数组
- */
- public function getTokens($value, XSDocument $doc = null);
- }
- /**
- * 内置空分词器
- *
- * @author hightman <hightman@twomice.net>
- * @version 1.0.0
- * @package XS.tokenizer
- */
- class XSTokenizerNone implements XSTokenizer
- {
- public function getTokens($value, XSDocument $doc = null)
- {
- return array();
- }
- }
- /**
- * 内置整值分词器
- *
- * @author hightman <hightman@twomice.net>
- * @version 1.0.0
- * @package XS.tokenizer
- */
- class XSTokenizerFull implements XSTokenizer
- {
- public function getTokens($value, XSDocument $doc = null)
- {
- return array($value);
- }
- }
- /**
- * 内置的分割分词器
- *
- * @author hightman <hightman@twomice.net>
- * @version 1.0.0
- * @package XS.tokenizer
- */
- class XSTokenizerSplit implements XSTokenizer
- {
- private $arg = ' ';
- public function __construct($arg = null)
- {
- if ($arg !== null && $arg !== '') {
- $this->arg = $arg;
- }
- }
- public function getTokens($value, XSDocument $doc = null)
- {
- if (strlen($this->arg) > 2 && substr($this->arg, 0, 1) == '/' && substr($this->arg, -1, 1) == '/') {
- return preg_split($this->arg, $value);
- }
- return explode($this->arg, $value);
- }
- }
- /**
- * 内置的定长分词器
- *
- * @author hightman <hightman@twomice.net>
- * @version 1.0.0
- * @package XS.tokenizer
- */
- class XSTokenizerXlen implements XSTokenizer
- {
- private $arg = 2;
- public function __construct($arg = null)
- {
- if ($arg !== null && $arg !== '') {
- $this->arg = intval($arg);
- if ($this->arg < 1 || $this->arg > 255) {
- throw new XSException('Invalid argument for ' . __CLASS__ . ': ' . $arg);
- }
- }
- }
- public function getTokens($value, XSDocument $doc = null)
- {
- $terms = array();
- for ($i = 0; $i < strlen($value); $i += $this->arg) {
- $terms[] = substr($value, $i, $this->arg);
- }
- return $terms;
- }
- }
- /**
- * 内置的步长分词器
- *
- * @author hightman <hightman@twomice.net>
- * @version 1.0.0
- * @package XS.tokenizer
- */
- class XSTokenizerXstep implements XSTokenizer
- {
- private $arg = 2;
- public function __construct($arg = null)
- {
- if ($arg !== null && $arg !== '') {
- $this->arg = intval($arg);
- if ($this->arg < 1 || $this->arg > 255) {
- throw new XSException('Invalid argument for ' . __CLASS__ . ': ' . $arg);
- }
- }
- }
- public function getTokens($value, XSDocument $doc = null)
- {
- $terms = array();
- $i = $this->arg;
- while (true) {
- $terms[] = substr($value, 0, $i);
- if ($i >= strlen($value)) {
- break;
- }
- $i += $this->arg;
- }
- return $terms;
- }
- }
- /**
- * SCWS - 分词器(与搜索服务端通讯)
- *
- * @author hightman <hightman@twomice.net>
- * @version 1.0.0
- * @package XS.tokenizer
- * @since 1.3.1
- */
- class XSTokenizerScws implements XSTokenizer
- {
- const MULTI_MASK = 15;
- /* @var string 字符集,默认为项目字符集 */
- private static $_charset;
- /* @var array 选项设置记录 */
- private $_setting = array();
- /* @var XSServer 分词服务对象 */
- private static $_server;
- /**
- * 构造函数
- * 初始化用于分词的搜索服务端
- * @param string $arg 复合等级参数,默认不指定
- */
- public function __construct($arg = null)
- {
- if (self::$_server === null) {
- $xs = XS::getLastXS();
- if ($xs === null) {
- throw new XSException('An XS instance should be created before using ' . __CLASS__);
- }
- self::$_server = $xs->getScwsServer();
- self::$_server->setTimeout(0);
- self::$_charset = $xs->getDefaultCharset();
- // constants
- if (!defined('SCWS_MULTI_NONE')) {
- define('SCWS_MULTI_NONE', 0);
- define('SCWS_MULTI_SHORT', 1);
- define('SCWS_MULTI_DUALITY', 2);
- define('SCWS_MULTI_ZMAIN', 4);
- define('SCWS_MULTI_ZALL', 8);
- }
- if (!defined('SCWS_XDICT_XDB')) {
- define('SCWS_XDICT_XDB', 1);
- define('SCWS_XDICT_MEM', 2);
- define('SCWS_XDICT_TXT', 4);
- }
- }
- if ($arg !== null && $arg !== '') {
- $this->setMulti($arg);
- }
- }
- /**
- * XSTokenizer 接口
- */
- public function getTokens($value, XSDocument $doc = null)
- {
- $tokens = array();
- $this->setIgnore(true);
- // save charset, force to use UTF-8
- $_charset = self::$_charset;
- self::$_charset = 'UTF-8';
- $words = $this->getResult($value);
- foreach ($words as $word) {
- $tokens[] = $word['word'];
- }
- // restore charset
- self::$_charset = $_charset;
- return $tokens;
- }
- /**
- * 设置字符集
- * 默认字符集是 UTF-8, 这是指 {@link getResult} 系列函数的 $text 参数的字符集
- * @param string $charset
- * @return XSTokenizerScws 返回对象本身以支持串接操作
- */
- public function setCharset($charset)
- {
- self::$_charset = strtoupper($charset);
- if (self::$_charset == 'UTF8') {
- self::$_charset = 'UTF-8';
- }
- return $this;
- }
- /**
- * 设置忽略标点符号
- * @param bool $yes 是否忽略
- * @return XSTokenizerScws 返回对象本身以支持串接操作
- */
- public function setIgnore($yes = true)
- {
- $this->_setting['ignore'] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_SET_IGNORE, $yes === false
- ? 0 : 1);
- return $this;
- }
- /**
- * 设置复合分词选项
- * @param int $mode 复合选项, 值范围 0~15
- * 默认为值为 3, 可使用常量组合:
- * SCWS_MULTI_SHORT|SCWS_MULTI_DUALITY|SCWS_MULTI_ZMAIN|SCWS_MULTI_ZALL
- * @return XSTokenizerScws 返回对象本身以支持串接操作
- */
- public function setMulti($mode = 3)
- {
- $mode = intval($mode) & self::MULTI_MASK;
- $this->_setting['multi'] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_SET_MULTI, $mode);
- return $this;
- }
- /**
- * 设置分词词典, 支持 TXT/XDB 格式
- * @param string $fpath 服务端的词典路径
- * @param int $mode 词典类型, 常量: SCWS_XDICT_XDB|SCWS_XDICT_TXT|SCWS_XDICT_MEM
- * @return XSTokenizerScws 返回对象本身以支持串接操作
- */
- public function setDict($fpath, $mode = null)
- {
- if (!is_int($mode)) {
- $mode = stripos($fpath, '.txt') !== false ? SCWS_XDICT_TXT : SCWS_XDICT_XDB;
- }
- $this->_setting['set_dict'] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_SET_DICT, $mode, $fpath);
- unset($this->_setting['add_dict']);
- return $this;
- }
- /**
- * 添加分词词典, 支持 TXT/XDB 格式
- * @param string $fpath 服务端的词典路径
- * @param int $mode 词典类型, 常量: SCWS_XDICT_XDB|SCWS_XDICT_TXT|SCWS_XDICT_MEM
- * @return XSTokenizerScws 返回对象本身以支持串接操作
- */
- public function addDict($fpath, $mode = null)
- {
- if (!is_int($mode)) {
- $mode = stripos($fpath, '.txt') !== false ? SCWS_XDICT_TXT : SCWS_XDICT_XDB;
- }
- if (!isset($this->_setting['add_dict'])) {
- $this->_setting['add_dict'] = array();
- }
- $this->_setting['add_dict'][] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_ADD_DICT, $mode, $fpath);
- return $this;
- }
- /**
- * 设置散字二元组合
- * @param bool $yes 是否开启散字自动二分组合功能
- * @return XSTokenizerScws 返回对象本身以支持串接操作
- */
- public function setDuality($yes = true)
- {
- $this->_setting['duality'] = new XSCommand(XS_CMD_SEARCH_SCWS_SET, XS_CMD_SCWS_SET_DUALITY, $yes === false
- ? 0 : 1);
- return $this;
- }
- /**
- * 获取 scws 版本号
- * @return string 版本号
- */
- public function getVersion()
- {
- $cmd = new XSCommand(XS_CMD_SEARCH_SCWS_GET, XS_CMD_SCWS_GET_VERSION);
- $res = self::$_server->execCommand($cmd, XS_CMD_OK_INFO);
- return $res->buf;
- }
- /**
- * 获取分词结果
- * @param string $text 待分词的文本
- * @return array 返回词汇数组, 每个词汇是包含 [off:词在文本中的位置,attr:词性,word:词]
- */
- public function getResult($text)
- {
- $words = array();
- $text = $this->applySetting($text);
- $cmd = new XSCommand(XS_CMD_SEARCH_SCWS_GET, XS_CMD_SCWS_GET_RESULT, 0, $text);
- $res = self::$_server->execCommand($cmd, XS_CMD_OK_SCWS_RESULT);
- while ($res->buf !== '') {
- $tmp = unpack('Ioff/a4attr/a*word', $res->buf);
- $tmp['word'] = XS::convert($tmp['word'], self::$_charset, 'UTF-8');
- $words[] = $tmp;
- $res = self::$_server->getRespond();
- }
- return $words;
- }
- /**
- * 获取重要词统计结果
- * @param string $text 待分词的文本
- * @param string $xattr 在返回结果的词性过滤, 多个词性之间用逗号分隔, 以~开头取反
- * 如: 设为 n,v 表示只返回名词和动词; 设为 ~n,v 则表示返回名词和动词以外的其它词
- * @return array 返回词汇数组, 每个词汇是包含 [times:次数,attr:词性,word:词]
- */
- public function getTops($text, $limit = 10, $xattr = '')
- {
- $words = array();
- $text = $this->applySetting($text);
- $cmd = new XSCommand(XS_CMD_SEARCH_SCWS_GET, XS_CMD_SCWS_GET_TOPS, $limit, $text, $xattr);
- $res = self::$_server->execCommand($cmd, XS_CMD_OK_SCWS_TOPS);
- while ($res->buf !== '') {
- $tmp = unpack('Itimes/a4attr/a*word', $res->buf);
- $tmp['word'] = XS::convert($tmp['word'], self::$_charset, 'UTF-8');
- $words[] = $tmp;
- $res = self::$_server->getRespond();
- }
- return $words;
- }
- /**
- * 判断是否包含指定词性的词
- * @param string $text 要判断的文本
- * @param string $xattr 要判断的词性, 参见 {@link getTops} 的说明
- * @return bool 文本中是否包含指定词性的词汇
- */
- public function hasWord($text, $xattr)
- {
- $text = $this->applySetting($text);
- $cmd = new XSCommand(XS_CMD_SEARCH_SCWS_GET, XS_CMD_SCWS_HAS_WORD, 0, $text, $xattr);
- $res = self::$_server->execCommand($cmd, XS_CMD_OK_INFO);
- return $res->buf === 'OK';
- }
- private function applySetting($text)
- {
- self::$_server->reopen();
- foreach ($this->_setting as $key => $cmd) {
- if (is_array($cmd)) {
- foreach ($cmd as $_cmd) {
- self::$_server->execCommand($_cmd);
- }
- } else {
- self::$_server->execCommand($cmd);
- }
- }
- return XS::convert($text, 'UTF-8', self::$_charset);
- }
- }
|