1、给Discuz帖子、文章标题创建索引
2、输入一个关键词,返回与这个关键词相关的帖子和文章
# coding=utf-8
"""Build a Whoosh full-text index over Discuz thread titles and query it.

On first run the script reads every row of ``pre_forum_thread`` from MySQL,
indexes the ``subject`` column (tokenized with jieba's ChineseAnalyzer)
together with ``tid`` and ``fid``, and stores the index on disk.  On later
runs the existing index is opened and only the search step is executed.
"""
import os
import sys
import json
import time

from whoosh.index import create_in
from whoosh.fields import *
from jieba.analyse import ChineseAnalyzer
from whoosh.qparser import QueryParser
from whoosh import qparser, scoring
from whoosh import index
import MySQLdb as mdb

# Python 2 only: make UTF-8 the implicit str<->unicode codec so the byte-string
# literals below can be mixed with the unicode values returned by MySQL/Whoosh.
# `reload` / `setdefaultencoding` do not exist on Python 3, hence the guard.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')

start = time.time()
con = mdb.connect('127.0.0.1','root','','wddis',charset='utf8',unix_socket='/tmp/mysql.sock')

# Table information:
#   pre_forum_thread.subject & tid        --> thread titles
#   pre_portal_article_title.title & aid  --> article titles
# NOTE: only pre_forum_thread is indexed by this script.


def new_index_sql():
    """Index every thread title from ``pre_forum_thread`` into ``ix``.

    Reads ``tid``, ``fid`` and ``subject`` for all rows through the
    module-level connection ``con`` and adds one document per row to the
    module-level index ``ix``, printing a running completion percentage to
    stdout.  The writer is committed when all rows have been added and
    cancelled if anything raises, so a failure does not leave the index
    write lock behind.

    Field values must be unicode strings for Whoosh.
    """
    # Fetch the rows before opening the writer so a database error cannot
    # leave a half-open index writer around.  This is a read-only SELECT, so
    # no transaction block around it is required.
    cur = con.cursor()
    try:
        cur.execute("select tid,fid,subject from pre_forum_thread")
        rows = cur.fetchall()
    finally:
        cur.close()

    # The fetched row count is the progress denominator; a separate
    # count(*) query could disagree with the rows actually returned.
    total = len(rows)

    # Used as a context manager, the writer commits on success and cancels
    # on an exception.
    with ix.writer(limitmb=256, procs=4) as writer:
        for n, (tid, fid, title) in enumerate(rows, 1):
            writer.add_document(title=title, tid=tid, fid=fid)
            percent = float(n) * 100 / float(total)
            # "\r" returns to the start of the line so the percentage is
            # redrawn in place instead of scrolling.
            sys.stdout.write("-----------> 完成百分比:%.2f" % percent)
            sys.stdout.write("%\r")
            sys.stdout.flush()
    sys.stdout.flush()


def search_index(words):
    """Search the ``title`` field of ``ix`` for each entry in ``words``.

    For every query string, the top 20 hits are printed as
    ``<query> <title> <tid> <fid>``, one hit per line.

    Args:
        words: iterable of query strings.
    """
    with ix.searcher() as s:
        # group=qparser.OrGroup: a document matches if it contains ANY of the
        # query terms, rather than requiring all of them.
        qp = QueryParser('title', schema=ix.schema, group=qparser.OrGroup)
        # Replace general wildcard support with prefix-only matching: a
        # trailing "*" (e.g. "term*") is still honoured, but wildcards in
        # other positions are no longer parsed as wildcards.
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())

        for word in words:
            q = qp.parse(u'%s' % word)
            results = s.search(q, limit=20)
            for hit in results:
                # Alternative output with highlights and score:
                # word + "----->" + hit["title"], hit.highlights("title"), hit.score
                print(u'%s %s %s %s' % (word, hit['title'], hit['tid'], hit['fid']))


# Tokenize Chinese text with jieba.
analyzer = ChineseAnalyzer()

# Index schema.  stored=True keeps the original value in the index so it can
# be returned with each search hit; only `title` is analyzed as full text.
schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    tid=NUMERIC(stored=True),
    fid=NUMERIC(stored=True),
)

# The index lives in the '798wd_luntan' directory.
indexdir = '798wd_luntan/'
if not os.path.exists(indexdir):
    os.mkdir(indexdir)

try:
    ix = index.open_dir(indexdir)
except index.EmptyIndexError:
    # No index in the directory yet: create it and populate it from MySQL.
    # Only this specific error triggers a rebuild; any other failure (locked
    # or corrupt index, Ctrl-C, ...) is allowed to propagate instead of
    # silently overwriting the existing index.
    print('>>>>>>>> 未创建索引 <<<<<<<<<<')
    ix = create_in(indexdir, schema)
    new_index_sql()
else:
    print('>>>>>>>> 已创建索引 <<<<<<<<<<')

words = ["容易下款的高炮口子"]
search_index(words)

# The database is not needed past this point.
con.close()

end = time.time()
print("完成时间: %f s" % (end - start))
转载请注明:思享SEO博客 » whoosh为discuz帖子和文章标题创建索引