Indexer.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. #!/usr/bin/env php
  2. <?php
  3. /**
  4. * Xunsearch PHP-SDK 索引管理工具
  5. *
  6. * @author hightman
  7. * @link http://www.xunsearch.com/
  8. * @copyright Copyright &copy; 2011 HangZhou YunSheng Network Technology Co., Ltd.
  9. * @license http://www.xunsearch.com/license/
  10. * @version $Id$
  11. */
  12. $lib_file = dirname(__FILE__) . '/../lib/XS.php';
  13. if (!file_exists($lib_file)) {
  14. $lib_file = dirname(__FILE__) . '/../lib/XS.class.php';
  15. }
  16. require_once $lib_file;
  17. require_once dirname(__FILE__) . '/XSUtil.class.php';
  18. require_once dirname(__FILE__) . '/XSDataSource.class.php';
  // Parse the command line.
  // (To raise the memory limit, enable: ini_set('memory_limit', '1024M');)
  // NOTE(review): the list handed to parseOpt() looks like the set of options
  // that take a value -- confirm against XSUtil::parseOpt().
  $value_opts = array(
      'p', 'c', 'd', 'project', 'charset', 'db', 'source', 'file', 'sql',
      'csv-delimiter', 'add-synonym', 'del-synonym',
  );
  XSUtil::parseOpt($value_opts);

  // Project is requested first, with the third getOpt() argument set to true.
  $project = XSUtil::getOpt('p', 'project', true);

  // Charset used for console output (and as the data source default further down).
  $charset = XSUtil::getOpt('c', 'charset');
  XSUtil::setCharset($charset);

  // Long options: each "--some-name" is exposed as the global $some_name.
  $params = array(
      'source', 'file', 'sql', 'rebuild', 'clean', 'flush', 'flush-log', 'info', 'csv-delimiter', 'filter',
      'add-synonym', 'del-synonym', 'stop-rebuild', 'custom-dict',
  );
  foreach ($params as $opt_name) {
      $var_name = str_replace('-', '_', $opt_name);
      ${$var_name} = XSUtil::getOpt(null, $opt_name);
  }

  // File is requested again, this time with the third argument set to true,
  // followed by the database name and the scws-multi level.
  $file = XSUtil::getOpt(null, 'file', true);
  $db = XSUtil::getOpt('d', 'db');
  $scws_multi = XSUtil::getOpt(null, 'scws-multi');
  // Print the usage text and stop when help is requested, when no project
  // was given, or when none of the action options is present.
  $show_help = XSUtil::getOpt('h', 'help') !== null || !is_string($project);
  if (!$show_help) {
      $has_action = $custom_dict || $stop_rebuild || $flush || $flush_log
          || $info || $clean || $source || $add_synonym || $del_synonym || $scws_multi;
      $show_help = !$has_action;
  }
  if ($show_help) {
      $version = XS_PACKAGE_NAME . '/' . XS_PACKAGE_VERSION;
      echo <<<EOF
Indexer - 索引批量管理、导入工具 ($version)
用法
{$_SERVER['argv'][0]} [options] [-p|--project] <project> [--file] <file>
选项说明
--project=<name|ini>
-p <project> 用于指定要搜索的项目名称或项目配置文件的路径,
如果指定的是名称,则使用 ../app/<name>.ini 作为配置文件
--charset=<gbk|utf-8>
-c <charset> 指定您当前在用以及数据源的字符集,以便系统进行智能转换(默认:UTF-8)
--db=<name>
-d <db> 指定项目中的数据库名称,默认是名为 db 的库
--source=mysql://[user[:passwd]@]host/dbname[/table]
指定数据源为 mysql
--source=sqlite:[//]<dbpath>|sqlite3:[//]<dbpath>
指定数据源为 sqlite 或 sqlite3
--source=json指定数据源为 json 格式,每行一条记录
--source=csv 指定数据源为 csv 格式,逗号分隔字段,每行一条记录,可在首行指定字段名
--csv-delimiter[=,] 指定 csv 数据源的字段分割符,默认为逗号,支持 \\t\\r\\n..\\xNN
使用 \\ 开头及其它与 shell 有岐议的分割符时请使用引号包围。
--file=<file>当数据源为 json 或 csv 格式时指定数据源文件,默认读取标准输入
--sql=<sql> 当数据源为 sql 类型时指定 sql 搜索语句,默认情况下,
如果在 --source 包含 table 则载入该表数据。
警告:请勿在 SQL 语句中包含 `` 反引号,这在 SHELL 中有特殊函义可能会出错
--filter <name|path>
指定数据过滤器,可为内置的 debug 或自定义的过滤器文件路径(不包含 .php)
过滤器必须实现接口 XSDataFilter
--add-synonym=<raw1:synonym1[,raw2:synonym2]...>
添加一个或多个同义词, 多个之间用半角逗号分隔, 原词和同义词之间用冒号分隔
--del-synonym=<raw1[:synonym1[,raw2[:synonym2]]]...>
删除一个或多个同义词, 多个之间用半角逗号分隔, 原词和同义词之间用冒号分隔
省略同义词则表示删除该原词的所有同义词
--scws-multi[=level]
查看或设置搜索语句的 scws 复合分词等级(值:0-15,默认为 3)
--rebuild 使用平滑重建方式导入数据,必须与 --source 配合使用
--stop-rebuild 强制中止没未完成的索引重建状态 (慎用)
--clean 清空库内当前的索引数据
--custom-dict 读取/设置项目自定义词库,默认为读取,配合 --file 指定文件去设置词库
--flush 强制提交刷新索引服务端的缓冲索引,与 --source 分开用
--flush-log 强制提交刷新搜索日志,与 --source 分开用
--info 查看当前索引库在服务端的信息(含服务端信息、数据缓冲、运行进程等)
-h|--help 显示帮助信息
EOF;
      exit(0);
  }
  // Resolve the project argument to an ini path and abort with exit(-1)
  // when no such file exists.
  $ini = XSUtil::toProjectIni($project);
  if (file_exists($ini) === false) {
      echo "错误:无效的项目名称 (" . $project . "),不存在相应的配置文件。\n";
      exit(-1);
  }
  // csv delimiter saved in super global variable: _SERVER
  // Only applies when --csv-delimiter carries a value and the source is csv.
  if (is_string($csv_delimiter) && $source == 'csv') {
      $csv_delimiter = csvParseDelimiter($csv_delimiter);
      // NOTE(review): presumably read back by the csv data source -- confirm in XSDataSource.
      $_SERVER['XS_CSV_DELIMITER'] = $csv_delimiter;
      printf("注意:CSV 字段分割符被修改为 `%c` (ASCII: 0x%02x)\n", ord($csv_delimiter), ord($csv_delimiter));
  }

  /**
   * Decode the value of --csv-delimiter into a single delimiter byte.
   *
   * Supported forms:
   *  - a value not starting with a backslash: its first byte is used;
   *  - "\\\\", "\\t", "\\r", "\\n": backslash, tab, carriage return, line feed
   *    (the usage text advertises \t \r \n, so all three are decoded);
   *  - "\\xNN": the byte with hexadecimal code NN.
   * Anything else (empty value, unknown escape, "\\x" without hex digits)
   * falls back to the default comma.
   *
   * @param string $spec raw option value as typed on the command line
   * @return string exactly one byte
   */
  function csvParseDelimiter($spec)
  {
      if ($spec === '') {
          return ',';
      }
      if ($spec[0] !== '\\') {
          return $spec[0];
      }

      $escapes = array('\\' => '\\', 't' => "\t", 'r' => "\r", 'n' => "\n");
      $char = substr($spec, 1, 1);
      if (is_string($char) && isset($escapes[$char])) {
          return $escapes[$char];
      }
      if ($char === 'x') {
          $hex = substr($spec, 2);
          // Require real hex digits so that "\x" alone does not become a NUL delimiter.
          if (is_string($hex) && preg_match('/^[0-9a-fA-F]+$/D', $hex)) {
              return chr(hexdec($hex));
          }
      }
      return ',';
  }
  // Resolve --filter into an XSDataFilter instance, or null when unusable.
  //
  // A string value is tried first as a built-in class named XS<Name>Filter,
  // then as a PHP file "<value>.php" whose class is named after the file's
  // basename. Any other non-null value (e.g. the boolean produced when the
  // option is given without a value) is rejected here as well; previously it
  // slipped through and the import loop later called ->process() on it.
  if ($filter !== null) {
      $original = is_string($filter) ? $filter : '';
      if (is_string($filter)) {
          $class = 'XS' . ucfirst(strtolower($filter)) . 'Filter';
          if (class_exists($class)) {
              $filter = new $class;
          } elseif (file_exists($filter . '.php')) {
              // The path comes straight from the command line; the file is executed as PHP.
              $class = basename($filter);
              require_once $filter . '.php';
              if (class_exists($class)) {
                  $filter = new $class;
              }
          }
      }
      // instanceof is false for every non-object, so this also covers strings and booleans.
      if (!($filter instanceof XSDataFilter)) {
          $filter = null;
          echo "注意:自动忽略无效的过滤器 [" . $original . "]\n";
      }
  }
  // Execute the indexer.
  //
  // Exactly one "special action" (--info, --flush-log, --flush, --custom-dict,
  // show scws-multi) is run if requested; otherwise the import pipeline runs:
  // clean -> stop rebuild -> begin rebuild -> import -> synonyms -> end rebuild / flush.
  // Any XSException escaping the pipeline is printed together with a shortened trace.
  try {
      // create xs object
      $xs = new XS($ini);
      $index = $xs->index;
      if ($db !== null) {
          $index->setDb($db);
      }

      // scws multi: a value was passed (not just the bare flag), so apply it
      if ($scws_multi !== null && $scws_multi !== true) {
          $index->setScwsMulti($scws_multi);
          if (!empty($source)) {
              // Clear it so the elseif chain below reaches the import branch
              // instead of the "show current level" branch.
              $scws_multi = null;
          }
      }

      // special actions
      if ($info !== null) {
          echo "---------- SERVER INFO BEGIN ----------\n";
          $res = $index->execCommand(XS_CMD_DEBUG);
          echo $res->buf;
          echo "\n---------- SERVER INFO END ----------\n";
          $res = $index->execCommand(XS_CMD_INDEX_GET_DB);
          $db_info = json_decode($res->buf);
          if (!is_object($db_info)) {
              // json_decode() yields null on malformed input; do not dereference it.
              echo "错误:无法解析服务端返回的数据库信息\n";
          } else {
              echo "数据库名:" . sprintf('%s[0x%04x]', $db_info->name, $db_info->flag) . "\n";
              echo "队列数据:" . $db_info->count . "条\n";
              echo "导入进程:" . ($db_info->pid > 0 ? '#' . $db_info->pid : '无') . "\n";
          }
      } elseif ($flush_log !== null) {
          echo "刷新搜索日志 ... \n";
          if ($index->flushLogging() === false) {
              echo "失败\n";
          } else {
              echo "成功,注意:后台更新需要一些时间,并不是真正立即完成。\n";
          }
      } elseif ($flush !== null) {
          echo "刷新索引缓冲 ... \n";
          if ($index->flushIndex() === false) {
              echo "失败\n";
          } else {
              echo "成功,注意:后台更新需要一些时间,并不是真正立即完成。\n";
          }
      } elseif ($custom_dict !== null) {
          if ($file === null) {
              // no --file: print the current custom dictionary
              $content = $index->getCustomDict();
              if ($content === '') {
                  echo "注意:该项目无自定义词库或内容为空!";
              } else {
                  // prepend a column header unless the content already starts with a '#' line
                  if (substr($content, 0, 1) !== '#') {
                      echo "# WORD\tTF\tIDF\tATTR\n";
                  }
                  echo $content;
              }
              echo "\n";
          } else {
              // --file given: replace the custom dictionary with the file content
              if ($file === true || !file_exists($file)) {
                  echo "错误:请正确指定要替换的自定义词库文件路径 (" . strval($file) . ")\n";
              } else {
                  $content = file_get_contents($file);
                  echo "正在提交自定义词库 (" . number_format(strlen($content)) . " bytes) ... ";
                  $index->setCustomDict($content);
                  echo "OK\n";
              }
          }
      } elseif ($scws_multi !== null) {
          $level = $index->getScwsMulti();
          echo "当前索引库的 scws 复合分词等级为:$level\n";
      } else {
          // clean
          if ($clean !== null) {
              echo "清空现有索引数据 ...\n";
              $index->clean();
          }

          // stop rebuild
          if ($stop_rebuild !== null) {
              echo "中止索引重建 ...\n";
              $index->stopRebuild();
          }

          // begin rebuild
          if ($rebuild !== null) {
              echo "开始重建索引 ...\n";
              $index->beginRebuild();
          }

          // import data from source
          $fid = $xs->getFieldId();
          if (!empty($source)) {
              echo "初始化数据源 ... $source \n";
              $total = $total_ok = $total_failed = 0;
              // sources containing ':' (URL-like) get the SQL statement, the others get the file
              $src = XSDataSource::instance($source, strpos($source, ':') ? $sql : $file);
              $dcs = $src->getCharset();
              if ($dcs === false) {
                  $dcs = $charset === null ? 'UTF-8' : $charset;
              }

              echo "开始批量导入数据 (" . (empty($file) ? "请直接输入数据" : $file) . ") ...\n";
              XSUtil::flush();
              $index->setTimeout(0);
              $index->openBuffer();
              while ($data = $src->getData()) {
                  $doc = new XSDocument($dcs);
                  if ($source == 'csv') {
                      $data = csvTransform($data);
                      // null means the row was consumed as the CSV header
                      if (is_null($data)) {
                          continue;
                      }
                  }

                  // Primary key is only needed for the "ignored by filter" warning;
                  // guard the lookup because a record may lack that field.
                  $pk = isset($data[$fid->name]) ? $data[$fid->name] : '';
                  if ($filter !== null && ($data = $filter->process($data, $dcs)) === false) {
                      $total++;
                      echo "警告:过滤器忽略了第 $total 条数据, 主键为:" . $pk . "\n";
                      continue;
                  }

                  $doc->setFields($data);
                  try {
                      // optional second filter stage working on the document itself
                      if ($filter !== null && method_exists($filter, 'processDoc')) {
                          $filter->processDoc($doc);
                      }
                      $total++;
                      $index->update($doc);
                      $total_ok++;
                  } catch (XSException $e) {
                      // a single failed record must not abort the whole import
                      echo "警告:添加第 $total 条数据失败 - " . $e->getMessage() . "\n";
                      // terminate the trace with a newline so the next message starts on its own line
                      echo $e->getTraceAsString() . "\n";
                      $total_failed++;
                  }
                  if (($total % 10000) == 0) {
                      echo "报告:累计已处理数据 $total 条 ...\n";
                  }
              }
              $index->closeBuffer();
              echo "完成索引导入:成功 $total_ok 条,失败 $total_failed 条\n";
          }

          // add synonyms: "raw:synonym" pairs separated by commas
          if (is_string($add_synonym)) {
              $rec = array();
              foreach (explode(",", $add_synonym) as $tmp) {
                  if (strpos($tmp, ':') === false) {
                      continue;
                  }
                  list($raw, $syn) = explode(':', $tmp, 2);
                  $raw = trim($raw);
                  $syn = trim($syn);
                  if ($raw !== '' && $syn !== '') {
                      $rec[] = array($raw, $syn);
                  }
              }
              echo "报告:开始添加同义词记录 " . count($rec) . " 条...\n";
              // buffering is only opened for more than one record
              if (count($rec) > 1) {
                  $index->openBuffer();
              }
              foreach ($rec as $tmp) {
                  $index->addSynonym($tmp[0], $tmp[1]);
              }
              if (count($rec) > 1) {
                  $index->closeBuffer();
              }
          }

          // del synonyms: "raw[:synonym]" items separated by commas;
          // an item without ':' is passed on with an empty synonym
          if (is_string($del_synonym)) {
              $rec = array();
              foreach (explode(",", $del_synonym) as $tmp) {
                  $syn = '';
                  if (strpos($tmp, ':') === false) {
                      $raw = trim($tmp);
                  } else {
                      list($raw, $syn) = explode(':', $tmp, 2);
                      $raw = trim($raw);
                      $syn = trim($syn);
                  }
                  if ($raw !== '') {
                      $rec[] = array($raw, $syn);
                  }
              }
              echo "报告:开始删除同义词记录 " . count($rec) . " 条...\n";
              if (count($rec) > 1) {
                  $index->openBuffer();
              }
              foreach ($rec as $tmp) {
                  $index->delSynonym($tmp[0], $tmp[1]);
              }
              if (count($rec) > 1) {
                  $index->closeBuffer();
              }
          }

          // end rebuild, or flush when not rebuilding
          if ($rebuild !== null) {
              echo "完成重建索引 ...\n";
              $index->endRebuild();
          } else {
              echo "刷新索引提交 ...\n";
              $index->flushIndex();
          }
      }
  } catch (XSException $e) {
      // Shorten paths in the trace: drop this script's directory prefix, then
      // replace the parent directory prefix with what XSException::getRelPath() returns.
      $start = dirname(dirname(__FILE__));
      $relative = XSException::getRelPath($start);
      $traceString = $e->getTraceAsString();
      $traceString = str_replace(dirname(__FILE__) . '/', '', $traceString);
      $traceString = str_replace($start . ($relative === '' ? '/' : ''), $relative, $traceString);
      echo $e . "\n" . $traceString . "\n";
  }
  /**
   * Translate one parsed CSV row into a "field name => value" record.
   *
   * On the first call the column list is initialised from the field names of
   * the project scheme (read through the global $xs). If every cell of that
   * first row is one of those field names, the row is treated as a header: it
   * becomes the column list, a notice is printed and null is returned so the
   * caller can skip the row.
   *
   * Cells are matched to columns strictly by position. The position is tracked
   * with its own counter rather than count($ret), because count($ret) stops
   * growing when a header repeats a field name and every later column would
   * then be read from the wrong cell.
   *
   * @param array $data cells of one CSV row, indexed from 0
   * @return array|null the mapped record (cut short at the first missing or
   *                    null cell), or null when the row was taken as the header
   */
  function csvTransform($data)
  {
      static $fields = null;
      global $xs; /* @var $xs XS */

      // init field set (first call only)
      if (is_null($fields)) {
          // default: field order of the scheme
          $fields = array_keys($xs->getScheme()->getAllFields());

          // the first row is a header only if all of its cells are known field names
          $is_header = true;
          foreach ($data as $tmp) {
              if (!in_array($tmp, $fields)) {
                  $is_header = false;
                  break;
              }
          }
          if ($is_header) {
              $fields = $data;
              echo "注意:CSV 数据字段被指定为:" . implode(',', $data) . "\n";
              return null;
          }
      }

      // transform by column position
      $ret = array();
      $pos = 0;
      foreach ($fields as $key) {
          if (!isset($data[$pos])) {
              break;
          }
          $ret[$key] = $data[$pos];
          $pos++;
      }
      return $ret;
  }