用xapian和xappy做全文搜索

今天尝试使用xapian 和xappy 以及 pychseg 做了一下数据库的全文索引, 发现比我想象的简单.

首先介绍一下背景知识:

xapian 是一个开源的搜索引擎. C++语言实现,并且有Perl,Python,PHP 等语言的接口. (类似java 中的lucene)
xappy 是对xapian 的Python 接口的进一步的封装.
pychseg 是基于python的中文分词项目。

首先下载和安装xapian. 按照 http://xapian.org/download 这个页面的说明, 安装 xapian-core 和xapian-bindings(实际上只要xapian-python 就可以了).

然后是安装xappy , 这个很简单 easy_install xappy 就可以了.

最后是pychseg .

svn checkout http://pychseg.googlecode.com/svn/trunk/ pychseg-read-only
cd pychseg-read-only
python setup.py install

不过这个代码有点问题: 有一处路径使用了 Windows 风格的分隔符, 还有一处用到的函数没有定义. (应用一下文末附件里的 patch 就可以了.)

安装好以后, 把 xappy/examples 下面的文件复制一份, 然后改一下.
下面是做索引的代码. 搜索的代码基本用原来的例子就可以了. 比我想象的简单.

#!/usr/bin/env python
import sys
import os
import xappy
from  pychseg.mmseg.algorithms import SimpleAlgorithm 
 
def tokens(text):
    """Split *text* into words with pychseg's SimpleAlgorithm.

    Returns a list holding ``str()`` of every item yielded by the
    algorithm's ``segment()`` call, in the order they are produced.
    """
    segmenter = SimpleAlgorithm(text)
    words = []
    for word in segmenter.segment():
        words.append(str(word))
    return words
 
def create_index(dbpath):
    """Create the index at *dbpath* and declare its field structure.

    The four identifying fields (book/author id and name) are each stored
    and indexed for exact matching; 'text' is indexed as free text only.
    The connection is closed before returning.
    """
    iconn = xappy.IndexerConnection(dbpath)

    exact_fields = ('book_id', 'book_name', 'author_id', 'author_name')
    exact_actions = (
        xappy.FieldActions.STORE_CONTENT,
        xappy.FieldActions.INDEX_EXACT,
    )
    # Register STORE_CONTENT then INDEX_EXACT for each field, field by field.
    for field_name in exact_fields:
        for action in exact_actions:
            iconn.add_field_action(field_name, action)

    iconn.add_field_action('text', xappy.FieldActions.INDEX_FREETEXT)
    iconn.close()
 
def open_index(dbpath):
    """Return an xappy IndexerConnection for the index at *dbpath*.

    The caller owns the returned connection.
    """
    connection = xappy.IndexerConnection(dbpath)
    return connection
 
def cn_seg(text):
    """Return *text* segmented by tokens(), with words joined by single spaces."""
    return " ".join(tokens(text))
 
def index_book(iconn, book):
    """Build a document for one book and add it to the index.

    *book* is read by key: 'book_id', 'book_name', 'author_id' and
    'author_name'.  The ids are stored as strings, and the free-text
    'text' field is the word-segmented book name plus author name.
    The document id is "book_<book_id>".
    """
    doc = xappy.UnprocessedDocument()

    book_id = book['book_id']
    title = book['book_name']
    author_id = book['author_id']
    author = book['author_name']

    field_values = [
        ('book_id', str(book_id)),
        ('book_name', title),
        ('author_id', str(author_id)),
        ('author_name', author),
        # Segment first so the free-text indexer sees space-separated words.
        ('text', cn_seg(title + '\n' + author)),
    ]
    for field_name, value in field_values:
        doc.fields.append(xappy.Field(field_name, value))

    doc.id = "book_%d" % book_id
    iconn.add(doc)
 
def index_all(iconn):
    """Index every book, fetching them one page at a time.

    Args:
        iconn: indexer connection handed through to index_book().

    Returns:
        The total number of books indexed.
    """
    count = 0
    i = 0  # zero-based page number
    num_per_page = 1000
    while True:
        # PSEUDO-CODE: replace the empty list below with a real paged query,
        # e.g. book_db.fetchall().limit(num_per_page, i * num_per_page)
        books = []
        for book in books:
            index_book(iconn, book)
        count += len(books)
        i += 1
        # A short (or empty) page means there is nothing left to fetch.
        if len(books) < num_per_page:
            break
    return count
 
def main(argv):
    """Create the 'dbindex' index, index all books and print the total.

    Args:
        argv: command-line arguments; currently unused.
    """
    dbpath = 'dbindex'
    create_index(dbpath)
    iconn = open_index(dbpath)
    try:
        count = index_all(iconn)
    finally:
        # Close explicitly, as create_index() does, so the connection is
        # released even when indexing fails.  Per the xappy documentation,
        # close() also writes out any changes not yet flushed.
        iconn.close()
    # Parenthesised single argument prints identically on Python 2.
    print("Indexed %d book." % count)
 
# Run the indexer only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)

这个做出来的索引基本还是可以用的, 但是有时候中文分词的效果还是不太好.

附件:pychseg_path

Index: pychseg/utils/myitertools.py
===================================================================
--- pychseg/utils/myitertools.py        （修订版 33）
+++ pychseg/utils/myitertools.py        （工作拷贝）
@@ -23,6 +23,8 @@
 #SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 
+def val2cmp(x):return x
+
 def takehighest(iterable, key, reverse=True):
     """
     >>> takehighest(lambda x:x/2, [9,3,4,1,4,9,2,8])
Index: pychseg/mmseg/worddict.py
===================================================================
--- pychseg/mmseg/worddict.py   （修订版 33）
+++ pychseg/mmseg/worddict.py   （工作拷贝）
@@ -36,8 +36,8 @@
 def load_dict():
     word_dict = {}
     load_path = pychseg.__path__[0]
-    chars = os.path.normpath( os.path.join(load_path, ".\\wordlist\\chars.lex" ) )
-    words = os.path.normpath( os.path.join(load_path, ".\\wordlist\\words.lex" ) )
+    chars = os.path.normpath( os.path.join(load_path, "./wordlist/chars.lex" ) )
+    words = os.path.normpath( os.path.join(load_path, "./wordlist/words.lex" ) )
     logging.info("loading single chars dict")
     load_words(chars, word_dict, "UTF-8")
     logging.info("loading words dict")
@@ -64,4 +64,4 @@
 if __name__ == "__main__":
     d = load_dict()
     print len(d)
-
\ 在文件末尾没有新行
+
Index: pychseg/mmseg/config.py
===================================================================
--- pychseg/mmseg/config.py     （修订版 33）
+++ pychseg/mmseg/config.py     （工作拷贝）
@@ -2,4 +2,4 @@
 
 WORD_MAX_LENGTH = 4
 
-OUTPUT_CHARSET = 'gbk'
\ 在文件末尾没有新行
+OUTPUT_CHARSET = 'utf-8'

(发现这个wp-syntax/genshi 还支持diff 格式, 很赞.)