XSFieldScheme.class.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571
  1. <?php
  /**
   * XSFieldScheme class definition file
   *
   * @author hightman
   * @link http://www.xunsearch.com/
   * @copyright Copyright &copy; 2011 HangZhou YunSheng Network Technology Co., Ltd.
   * @license http://www.xunsearch.com/license/
   * @version $Id$
   */
  /**
   * XS data field scheme.
   *
   * A scheme holds a number of field definition objects ({@link XSFieldMeta}).
   * A valid scheme contains exactly one field of type ID, and all of its fields
   * can be traversed with foreach.
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS
   */
  class XSFieldScheme implements IteratorAggregate
  {
      /**
       * The vno given to the field of type BODY by {@link addField}
       */
      const MIXED_VNO = 255;

      /**
       * @var XSFieldMeta[] field objects indexed by field name (the ID field is kept first)
       */
      private $_fields = array();

      /**
       * @var array special field type (ID, TITLE, BODY) => name of the field holding that type
       */
      private $_typeMap = array();

      /**
       * @var array field vno => field name
       */
      private $_vnoMap = array();

      /**
       * @var XSFieldScheme lazily created scheme returned by {@link logger}
       */
      private static $_logger;

      /**
       * Convert the scheme into a configuration file string
       * @return string the toConfig() output of every field, each followed by a newline
       */
      public function __toString()
      {
          $str = '';
          foreach ($this->_fields as $field) {
              $str .= $field->toConfig() . "\n";
          }
          return $str;
      }

      /**
       * Get the metadata of the primary key field
       * @return XSFieldMeta the field of type ID, or false if the scheme has none
       */
      public function getFieldId()
      {
          if (isset($this->_typeMap[XSFieldMeta::TYPE_ID])) {
              $name = $this->_typeMap[XSFieldMeta::TYPE_ID];
              return $this->_fields[$name];
          }
          return false;
      }

      /**
       * Get the metadata of the title field
       *
       * When no field of type TITLE exists, the first string field whose index
       * is not boolean is used instead.
       * @return XSFieldMeta the title field, or false if no suitable field exists
       */
      public function getFieldTitle()
      {
          if (isset($this->_typeMap[XSFieldMeta::TYPE_TITLE])) {
              $name = $this->_typeMap[XSFieldMeta::TYPE_TITLE];
              return $this->_fields[$name];
          }
          foreach ($this->_fields as $name => $field) {
              if ($field->type === XSFieldMeta::TYPE_STRING && !$field->isBoolIndex()) {
                  return $field;
              }
          }
          return false;
      }

      /**
       * Get the metadata of the body field
       * @return XSFieldMeta the field of type BODY, or false if the scheme has none
       */
      public function getFieldBody()
      {
          if (isset($this->_typeMap[XSFieldMeta::TYPE_BODY])) {
              $name = $this->_typeMap[XSFieldMeta::TYPE_BODY];
              return $this->_fields[$name];
          }
          return false;
      }

      /**
       * Get the metadata of one field
       * @param mixed $name field name (string) or field number (vno, int)
       * @param bool $throw whether to throw an exception when the field does not exist, defaults to true
       * @return XSFieldMeta the field metadata object, or false if it does not exist
       * @throws XSException when the field does not exist and $throw is true
       */
      public function getField($name, $throw = true)
      {
          if (is_int($name)) {
              // An integer is a vno: translate it to the field name first
              if (!isset($this->_vnoMap[$name])) {
                  if ($throw === true) {
                      throw new XSException('Not exists field with vno: `' . $name . '\'');
                  }
                  return false;
              }
              $name = $this->_vnoMap[$name];
          }
          if (!isset($this->_fields[$name])) {
              if ($throw === true) {
                  throw new XSException('Not exists field with name: `' . $name . '\'');
              }
              return false;
          }
          return $this->_fields[$name];
      }

      /**
       * Get the definitions of all fields of the project
       * @return XSFieldMeta[]
       */
      public function getAllFields()
      {
          return $this->_fields;
      }

      /**
       * Get the mapping between the vno and the name of all fields
       * @return array array with the vno as key and the field name as value
       */
      public function getVnoMap()
      {
          return $this->_vnoMap;
      }

      /**
       * Add a field to the scheme
       *
       * A special field type (ID, TITLE, BODY) may appear only once per scheme.
       * @param mixed $field an XSFieldMeta object to add, or a string holding the field name
       *  which is used together with $config to create the field object
       * @param array $config configuration of the new field when $field is a string
       * @throws XSException on a duplicated field name or a duplicated special field type
       */
      public function addField($field, $config = null)
      {
          if (!$field instanceof XSFieldMeta) {
              $field = new XSFieldMeta($field, $config);
          }
          if (isset($this->_fields[$field->name])) {
              throw new XSException('Duplicated field name: `' . $field->name . '\'');
          }
          if ($field->isSpeical()) {
              if (isset($this->_typeMap[$field->type])) {
                  $prev = $this->_typeMap[$field->type];
                  // The type name comes from the field itself: $config is null
                  // when a ready-made XSFieldMeta object was passed in.
                  throw new XSException('Duplicated ' . self::_specialTypeName($field->type) . ' field: `' . $field->name . '\' and `' . $prev . '\'');
              }
              $this->_typeMap[$field->type] = $field->name;
          }
          // The body field always gets MIXED_VNO, any other field gets the
          // number of vno entries registered so far.
          $field->vno = ($field->type == XSFieldMeta::TYPE_BODY) ? self::MIXED_VNO : count($this->_vnoMap);
          $this->_vnoMap[$field->vno] = $field->name;
          // save field, ensure ID is the first field
          if ($field->type == XSFieldMeta::TYPE_ID) {
              // Array union instead of array_merge(): array_merge() renumbers
              // integer-like keys, which would lose fields with numeric names.
              $this->_fields = array($field->name => $field) + $this->_fields;
          } else {
              $this->_fields[$field->name] = $field;
          }
      }

      /**
       * Check whether the scheme is valid and usable
       *
       * The check performed here is that a field of type ID exists.
       * @param bool $throw whether to throw an exception when the check fails, defaults to false
       * @return bool true if valid, false otherwise
       * @throws XSException when the check fails and $throw is true
       */
      public function checkValid($throw = false)
      {
          if (!isset($this->_typeMap[XSFieldMeta::TYPE_ID])) {
              if ($throw) {
                  throw new XSException('Missing field of type ID');
              }
              return false;
          }
          return true;
      }

      /**
       * IteratorAggregate interface, allows foreach over all fields
       *
       * The attribute below silences the PHP 8.1+ return type deprecation
       * notice; older PHP versions parse that line as a comment.
       * @return ArrayIterator iterator over the fields, keyed by field name
       */
      #[\ReturnTypeWillChange]
      public function getIterator()
      {
          return new ArrayIterator($this->_fields);
      }

      /**
       * Get the field scheme of the search log
       *
       * The scheme is built on the first call and cached in a static property.
       * @return XSFieldScheme the search log field scheme
       */
      public static function logger()
      {
          if (self::$_logger === null) {
              $scheme = new self();
              $scheme->addField('id', array('type' => 'id'));
              $scheme->addField('pinyin');
              $scheme->addField('partial');
              $scheme->addField('total', array('type' => 'numeric', 'index' => 'self'));
              $scheme->addField('lastnum', array('type' => 'numeric', 'index' => 'self'));
              $scheme->addField('currnum', array('type' => 'numeric', 'index' => 'self'));
              $scheme->addField('currtag', array('type' => 'string'));
              $scheme->addField('body', array('type' => 'body'));
              self::$_logger = $scheme;
          }
          return self::$_logger;
      }

      /**
       * Get the upper-case name of a special field type for error messages
       * @param int $type one of XSFieldMeta::TYPE_ID, TYPE_TITLE, TYPE_BODY
       * @return string 'ID', 'TITLE' or 'BODY'; 'SPECIAL' for any other value
       */
      private static function _specialTypeName($type)
      {
          if ($type == XSFieldMeta::TYPE_ID) {
              return 'ID';
          }
          if ($type == XSFieldMeta::TYPE_TITLE) {
              return 'TITLE';
          }
          if ($type == XSFieldMeta::TYPE_BODY) {
              return 'BODY';
          }
          return 'SPECIAL';
      }
  }
  /**
   * Metadata describing the structure of one data field.
   *
   * Each search project contains a number of fields; the field metadata is
   * stored in the ini configuration file of the project.
   *
   * @author hightman <hightman@twomice.net>
   * @version 1.0.0
   * @package XS
   * @see XS::loadIniFile()
   */
  class XSFieldMeta
  {
      /**
       * Maximum value of the term weight
       */
      const MAX_WDF = 0x3f;

      /**
       * Field type constants
       */
      const TYPE_STRING = 0;
      const TYPE_NUMERIC = 1;
      const TYPE_DATE = 2;
      const TYPE_ID = 10;
      const TYPE_TITLE = 11;
      const TYPE_BODY = 12;

      /**
       * Index flag constants
       */
      const FLAG_INDEX_SELF = 0x01;
      const FLAG_INDEX_MIXED = 0x02;
      const FLAG_INDEX_BOTH = 0x03;
      const FLAG_WITH_POSITION = 0x10;
      const FLAG_NON_BOOL = 0x80; // force the field to take part in weight calculation (non-boolean)

      /**
       * @var string field name
       * In theory any visible character is supported; the recommended character
       * range is [0-9A-Za-z-_] with a length of 1~32 bytes
       */
      public $name;

      /**
       * @var int cut length (unit: bytes)
       * Used to automatically truncate long field content in search results;
       * the default 0 means no truncation, fields of type body default to 300 bytes
       */
      public $cutlen = 0;

      /**
       * @var int relative weight when searching the mixed area
       * Value range: 1~63, fields of type title default to 5, other fields to 1
       */
      public $weight = 1;

      /**
       * @var int field type
       */
      public $type = 0;

      /**
       * @var int field number
       * Value range 0~255, unique within one field scheme, determined by {@link XSFieldScheme::addField}
       */
      public $vno = 0;

      /**
       * @var string tokenizer
       */
      private $tokenizer = XSTokenizer::DFL;

      /**
       * @var int index flag settings
       */
      private $flag = 0;

      /**
       * @staticvar XSTokenizer[] cache of tokenizer instances, keyed by the tokenizer setting string
       */
      private static $_tokenizers = array();

      /**
       * Constructor
       * @param string $name field name
       * @param array $config optional, initial configuration of the field
       */
      public function __construct($name, $config = null)
      {
          $this->name = strval($name);
          if (is_array($config)) {
              $this->fromConfig($config);
          }
      }

      /**
       * Convert the object into a string
       * @return string field name
       */
      public function __toString()
      {
          return $this->name;
      }

      /**
       * Convert the given value into the data format of this field
       *
       * Only fields of type date are converted: any value that is not already
       * an 8 character numeric value is formatted as YYYYmmdd, treating numeric
       * values as unix timestamps and everything else as a strtotime() expression.
       * @param mixed $value original value
       * @return mixed converted value
       */
      public function val($value)
      {
          if ($this->type == self::TYPE_DATE) {
              // Date type: convert into the dedicated YYYYmmdd format
              if (!is_numeric($value) || strlen($value) !== 8) {
                  $timestamp = is_numeric($value) ? $value : strtotime($value);
                  // Explicit cast: date() expects an int. Numeric strings, floats
                  // and the false returned by a failed strtotime() were coerced
                  // implicitly before, which is deprecated for fractional
                  // values since PHP 8.1.
                  $value = date('Ymd', (int) $timestamp);
              }
          }
          return $value;
      }

      /**
       * Check whether the index of this field supports phrase search
       * @return bool true if it does, false otherwise
       */
      public function withPos()
      {
          return ($this->flag & self::FLAG_WITH_POSITION) ? true : false;
      }

      /**
       * Check whether the index of this field is boolean
       *
       * Currently only the built-in tokenizer supports syntax based indexes;
       * since version 1.0.1 fields without index are also treated as boolean
       * to simplify checks. The non-bool flag always forces a false result.
       * @return bool true if it is, false otherwise
       */
      public function isBoolIndex()
      {
          if ($this->flag & self::FLAG_NON_BOOL) {
              return false;
          }
          return (!$this->hasIndex() || $this->tokenizer !== XSTokenizer::DFL);
      }

      /**
       * Check whether this field is numeric
       * @return bool true if it is, false otherwise
       */
      public function isNumeric()
      {
          return ($this->type == self::TYPE_NUMERIC);
      }

      /**
       * Check whether this field has a special type
       *
       * The special types are id, title and body; each project can have at most
       * one field of each of these types.
       * @return bool true if it has, false otherwise
       */
      public function isSpeical()
      {
          return ($this->type == self::TYPE_ID || $this->type == self::TYPE_TITLE || $this->type == self::TYPE_BODY);
      }

      /**
       * Check whether this field needs to be indexed
       * @return bool true if needed, false otherwise
       */
      public function hasIndex()
      {
          return ($this->flag & self::FLAG_INDEX_BOTH) ? true : false;
      }

      /**
       * Check whether this field needs to be indexed in the mixed area
       * @return bool true if needed, false otherwise
       */
      public function hasIndexMixed()
      {
          return ($this->flag & self::FLAG_INDEX_MIXED) ? true : false;
      }

      /**
       * Check whether this field needs to be indexed in its own field area
       * @return bool true if needed, false otherwise
       */
      public function hasIndexSelf()
      {
          return ($this->flag & self::FLAG_INDEX_SELF) ? true : false;
      }

      /**
       * Check whether this field uses a custom tokenizer
       * @return bool true if it does, false otherwise
       */
      public function hasCustomTokenizer()
      {
          return ($this->tokenizer !== XSTokenizer::DFL);
      }

      /**
       * Get the custom tokenizer
       *
       * The setting has the form `name` or `name(arg)` and maps to the class
       * XSTokenizer<Name>. Since 1.4.8 a missing class is loaded automatically
       * from XSTokenizer???.class.php in the current directory or in XS_LIB_ROOT.
       * Created instances are cached per setting string.
       * @return XSTokenizer the custom tokenizer of this field
       * @throws XSException if the tokenizer does not exist or is invalid
       */
      public function getCustomTokenizer()
      {
          if (isset(self::$_tokenizers[$this->tokenizer])) {
              return self::$_tokenizers[$this->tokenizer];
          } else {
              // Split "name(arg)" into the class name and the constructor argument
              if (($pos1 = strpos($this->tokenizer, '(')) !== false
                  && ($pos2 = strrpos($this->tokenizer, ')', $pos1 + 1))) {
                  $name = 'XSTokenizer' . ucfirst(trim(substr($this->tokenizer, 0, $pos1)));
                  $arg = substr($this->tokenizer, $pos1 + 1, $pos2 - $pos1 - 1);
              } else {
                  $name = 'XSTokenizer' . ucfirst($this->tokenizer);
                  $arg = null;
              }
              if (!class_exists($name)) {
                  $file = $name . '.class.php';
                  if (file_exists($file)) {
                      require_once $file;
                  } else if (file_exists(XS_LIB_ROOT . DIRECTORY_SEPARATOR . $file)) {
                      require_once XS_LIB_ROOT . DIRECTORY_SEPARATOR . $file;
                  }
                  if (!class_exists($name)) {
                      throw new XSException('Undefined custom tokenizer `' . $this->tokenizer . '\' for field `' . $this->name . '\'');
                  }
              }
              $obj = $arg === null ? new $name : new $name($arg);
              if (!$obj instanceof XSTokenizer) {
                  throw new XSException($name . ' for field `' . $this->name . '\' dose not implement the interface: XSTokenizer');
              }
              self::$_tokenizers[$this->tokenizer] = $obj;
              return $obj;
          }
      }

      /**
       * Convert the object into a configuration file string
       *
       * Settings equal to the defaults applied by {@link fromConfig} for the
       * field type are left out.
       * @return string the resulting configuration file string
       */
      public function toConfig()
      {
          // type
          $str = "[" . $this->name . "]\n";
          if ($this->type === self::TYPE_NUMERIC) {
              $str .= "type = numeric\n";
          } elseif ($this->type === self::TYPE_DATE) {
              $str .= "type = date\n";
          } elseif ($this->type === self::TYPE_ID) {
              $str .= "type = id\n";
          } elseif ($this->type === self::TYPE_TITLE) {
              $str .= "type = title\n";
          } elseif ($this->type === self::TYPE_BODY) {
              $str .= "type = body\n";
          }
          // index ("both" is the default of title, "self" the default of id)
          if ($this->type !== self::TYPE_BODY && ($index = ($this->flag & self::FLAG_INDEX_BOTH))) {
              if ($index === self::FLAG_INDEX_BOTH) {
                  if ($this->type !== self::TYPE_TITLE) {
                      $str .= "index = both\n";
                  }
              } elseif ($index === self::FLAG_INDEX_MIXED) {
                  $str .= "index = mixed\n";
              } else {
                  if ($this->type !== self::TYPE_ID) {
                      $str .= "index = self\n";
                  }
              }
          }
          // tokenizer
          if ($this->type !== self::TYPE_ID && $this->tokenizer !== XSTokenizer::DFL) {
              $str .= "tokenizer = " . $this->tokenizer . "\n";
          }
          // cutlen (300 is the default of body)
          if ($this->cutlen > 0 && !($this->cutlen === 300 && $this->type === self::TYPE_BODY)) {
              $str .= "cutlen = " . $this->cutlen . "\n";
          }
          // weight (5 is the default of title)
          if ($this->weight !== 1 && !($this->weight === 5 && $this->type === self::TYPE_TITLE)) {
              $str .= "weight = " . $this->weight . "\n";
          }
          // phrase (enabled by default for body and title only)
          if ($this->flag & self::FLAG_WITH_POSITION) {
              if ($this->type !== self::TYPE_BODY && $this->type !== self::TYPE_TITLE) {
                  $str .= "phrase = yes\n";
              }
          } else {
              if ($this->type === self::TYPE_BODY || $this->type === self::TYPE_TITLE) {
                  $str .= "phrase = no\n";
              }
          }
          // non-bool
          if ($this->flag & self::FLAG_NON_BOOL) {
              $str .= "non_bool = yes\n";
          }
          return $str;
      }

      /**
       * Parse the properties of the field object
       *
       * Recognised keys: type, index, cutlen, weight, phrase, non_bool, tokenizer.
       * A type or index value without a matching class constant is ignored.
       * @param array $config raw configuration array
       */
      public function fromConfig($config)
      {
          // type & default setting
          if (isset($config['type'])) {
              $predef = 'self::TYPE_' . strtoupper($config['type']);
              if (defined($predef)) {
                  $this->type = constant($predef);
                  if ($this->type == self::TYPE_ID) {
                      $this->flag = self::FLAG_INDEX_SELF;
                      $this->tokenizer = 'full';
                  } elseif ($this->type == self::TYPE_TITLE) {
                      $this->flag = self::FLAG_INDEX_BOTH | self::FLAG_WITH_POSITION;
                      $this->weight = 5;
                  } elseif ($this->type == self::TYPE_BODY) {
                      $this->vno = XSFieldScheme::MIXED_VNO;
                      $this->flag = self::FLAG_INDEX_SELF | self::FLAG_WITH_POSITION;
                      $this->cutlen = 300;
                  }
              }
          }
          // index flag (not configurable for body, always includes self for id)
          if (isset($config['index']) && $this->type != self::TYPE_BODY) {
              $predef = 'self::FLAG_INDEX_' . strtoupper($config['index']);
              if (defined($predef)) {
                  $this->flag &= ~ self::FLAG_INDEX_BOTH;
                  $this->flag |= constant($predef);
              }
              if ($this->type == self::TYPE_ID) {
                  $this->flag |= self::FLAG_INDEX_SELF;
              }
          }
          // others
          if (isset($config['cutlen'])) {
              $this->cutlen = intval($config['cutlen']);
          }
          if (isset($config['weight']) && $this->type != self::TYPE_BODY) {
              // Mask the value so that it never exceeds MAX_WDF
              $this->weight = intval($config['weight']) & self::MAX_WDF;
          }
          if (isset($config['phrase'])) {
              if (!strcasecmp($config['phrase'], 'yes')) {
                  $this->flag |= self::FLAG_WITH_POSITION;
              } elseif (!strcasecmp($config['phrase'], 'no')) {
                  $this->flag &= ~ self::FLAG_WITH_POSITION;
              }
          }
          if (isset($config['non_bool'])) {
              if (!strcasecmp($config['non_bool'], 'yes')) {
                  $this->flag |= self::FLAG_NON_BOOL;
              } elseif (!strcasecmp($config['non_bool'], 'no')) {
                  $this->flag &= ~ self::FLAG_NON_BOOL;
              }
          }
          // The tokenizer of the id field cannot be changed
          if (isset($config['tokenizer']) && $this->type != self::TYPE_ID
              && $config['tokenizer'] != 'default') {
              $this->tokenizer = $config['tokenizer'];
          }
      }
  }