用xapian和xappy做全文搜索
今天尝试使用xapian 和xappy 以及 pychseg 做了一下数据库的全文索引, 发现比我想象的简单.
首先介绍一下背景知识:
- xapian 是一个开源的搜索引擎. C++语言实现,并且有Perl,Python,PHP 等语言的接口. (类似java 中的lucene)
- xappy 是对xapian 的Python 接口的进一步的封装.
- pychseg 是基于python的中文分词项目。
首先下载和安装xapian. 按照 http://xapian.org/download 这个页面的说明, 安装 xapian-core 和xapian-bindings(实际上只要xapian-python 就可以了).
然后是安装xappy , 这个很简单 easy_install xappy 就可以了.
最后是pychseg .
svn checkout http://pychseg.googlecode.com/svn/trunk/ pychseg-read-only
cd pychseg-read-only
python setup.py install
不过这个代码有点问题: 有一处路径使用了 Windows 风格的写法, 还有一处调用的函数没有定义. (应用一下文末的这个patch就可以了.)
安装好以后把xappy/examples 下面的文件复制一份, 然后改一下.
下面是做索引的代码. 搜索的代码基本用原来的例子就可以了. 比我想象的简单.
#!/usr/bin/env python
"""Build a xappy full-text index of books, segmenting Chinese text with pychseg."""
import sys
import os

import xappy
from pychseg.mmseg.algorithms import SimpleAlgorithm


def tokens(text):
    """Return the segments SimpleAlgorithm produces for ``text``, each as a str."""
    a = SimpleAlgorithm(text)
    return [str(w) for w in a.segment()]


def create_index(dbpath):
    """Create a new index at ``dbpath`` and set up its field structure.

    The id/name fields are stored and indexed for exact matching; the
    ``text`` field is indexed as free text only.
    """
    iconn = xappy.IndexerConnection(dbpath)

    iconn.add_field_action('book_id', xappy.FieldActions.STORE_CONTENT)
    iconn.add_field_action('book_id', xappy.FieldActions.INDEX_EXACT)
    iconn.add_field_action('book_name', xappy.FieldActions.STORE_CONTENT)
    iconn.add_field_action('book_name', xappy.FieldActions.INDEX_EXACT)
    iconn.add_field_action('author_id', xappy.FieldActions.STORE_CONTENT)
    iconn.add_field_action('author_id', xappy.FieldActions.INDEX_EXACT)
    iconn.add_field_action('author_name', xappy.FieldActions.STORE_CONTENT)
    iconn.add_field_action('author_name', xappy.FieldActions.INDEX_EXACT)
    iconn.add_field_action('text', xappy.FieldActions.INDEX_FREETEXT)

    iconn.close()


def open_index(dbpath):
    """Open an existing index and return its indexer connection."""
    return xappy.IndexerConnection(dbpath)


def cn_seg(text):
    """Return ``text`` with its segmented words joined by single spaces."""
    return " ".join(tokens(text))


def index_book(iconn, book):
    """Add one book to the index.

    ``book`` is a mapping with the keys ``book_id``, ``author_id``,
    ``book_name`` and ``author_name``. The document id is ``book_<book_id>``.
    """
    doc = xappy.UnprocessedDocument()
    doc.fields.append(xappy.Field('book_id', str(book['book_id'])))
    doc.fields.append(xappy.Field('book_name', book['book_name']))
    doc.fields.append(xappy.Field('author_id', str(book['author_id'])))
    doc.fields.append(xappy.Field('author_name', book['author_name']))
    # The free-text field holds the pre-segmented book name and author name.
    doc.fields.append(xappy.Field(
        'text', cn_seg(book['book_name'] + '\n' + book['author_name'])))
    doc.id = "book_%d" % book['book_id']
    iconn.add(doc)


def fetch_books(limit, offset):
    """Return up to ``limit`` book records, starting at row ``offset``.

    Placeholder for the article's pseudo-code
    ``book_db.fetchall().limit(num_per_page, i * num_per_page)``.
    Replace it with a real database query that returns a sequence of
    mappings carrying the keys index_book() reads.
    """
    raise NotImplementedError(
        "fetch_books() is a placeholder; plug in your own database query")


def index_all(iconn):
    """Index every book, one page at a time, and return how many were indexed."""
    count = 0
    page = 0
    num_per_page = 1000
    while True:
        books = fetch_books(num_per_page, page * num_per_page)
        for book in books:
            index_book(iconn, book)
        count += len(books)
        page += 1
        # A short (or empty) page means the source has been exhausted.
        if len(books) < num_per_page:
            break
    return count


def main(argv):
    """Create the 'dbindex' index, index all books and print the total."""
    dbpath = 'dbindex'
    create_index(dbpath)
    iconn = open_index(dbpath)
    try:
        count = index_all(iconn)
        # Commit explicitly rather than relying on interpreter teardown.
        iconn.flush()
    finally:
        iconn.close()
    # Parenthesised single-argument form works as a Python 2 print statement
    # and as a Python 3 function call.
    print("Indexed %d book." % count)


if __name__ == '__main__':
    main(sys.argv)
这个做出来的索引基本还是可以用的,但是中文有时候的分词还是不太好.
Index: pychseg/utils/myitertools.py
===================================================================
--- pychseg/utils/myitertools.py (修订版 33)
+++ pychseg/utils/myitertools.py (工作拷贝)
@@ -23,6 +23,8 @@
 #SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 
+def val2cmp(x):return x
+
 def takehighest(iterable, key, reverse=True):
     """
     >>> takehighest(lambda x:x/2, [9,3,4,1,4,9,2,8])
Index: pychseg/mmseg/worddict.py
===================================================================
--- pychseg/mmseg/worddict.py (修订版 33)
+++ pychseg/mmseg/worddict.py (工作拷贝)
@@ -36,8 +36,8 @@
 def load_dict():
     word_dict = {}
     load_path = pychseg.__path__[0]
-    chars = os.path.normpath( os.path.join(load_path, ".\\wordlist\\chars.lex" ) )
-    words = os.path.normpath( os.path.join(load_path, ".\\wordlist\\words.lex" ) )
+    chars = os.path.normpath( os.path.join(load_path, "./wordlist/chars.lex" ) )
+    words = os.path.normpath( os.path.join(load_path, "./wordlist/words.lex" ) )
     logging.info("loading single chars dict")
     load_words(chars, word_dict, "UTF-8")
     logging.info("loading words dict")
@@ -64,4 +64,4 @@
 if __name__ == "__main__":
     d = load_dict()
     print len(d)
-
\ 在文件 末尾没有新行
+
Index: pychseg/mmseg/config.py
===================================================================
--- pychseg/mmseg/config.py (修订版 33)
+++ pychseg/mmseg/config.py (工作拷贝)
@@ -2,4 +2,4 @@
 
 WORD_MAX_LENGTH = 4
 
-OUTPUT_CHARSET = 'gbk'
\ 在文件 末尾没有新行
+OUTPUT_CHARSET = 'utf-8'
(发现这个wp-syntax/GeSHi 还支持diff 格式, 很赞.)