PHP - coreseek使用教程

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/wyansai/article/details/81814411

coreseek使用教程

前提依赖

  • ubuntu 16.04
  • coreseek 4.1 下载
  • libiconv 下载
  • 必备的其他依赖包:
sudo apt install automake libtool libxml2-dev libexpat1-dev
sudo apt install mysql-client mysql-server libmysqlclient-dev
sudo apt install python-dev

安装mmseg分词工具

cd mmseg-3.2.14
./bootstrap
./configure --prefix=/usr/local/mmseg3
sudo make && sudo make install

安装libiconv(1.14 需先打下面的补丁,或者改为安装 1.13.1)

wget -O - http://blog.atime.me/static/resource/libiconv-glibc-2.16.patch.gz | gzip -d - | patch -p0
cd libiconv-1.14
./configure
make && sudo make install
sudo ldconfig

安装csft

  • configure.ac中去掉AM_INIT_AUTOMAKE里的-Werror
  • buildconf.sh中automake后加入 --add-missing
wget -O - http://blog.atime.me/static/resource/sphinxexpr-gcc4.7.patch.gz | gzip -d - | patch -p0
sh buildconf.sh
./configure --prefix=/usr/local/coreseek  --without-unixodbc --with-mmseg --with-mmseg-includes=/usr/local/mmseg3/include/mmseg/ --with-mmseg-libs=/usr/local/mmseg3/lib/ --with-mysql --with-python LIBS=-liconv
make && sudo make install

安装libsphinxclient

sh buildconf.sh
./configure
make && sudo make install

安装sphinx扩展

  • 下载源码点击Browse Source,在最下面找到支持php7的源码
./configure 
make && sudo make install

centos下安装教程

  • 安装环境
yum install make gcc gcc++ gcc-c++ libtool autoconf automake imake mysql-devel libxml2-devel expat-devel python-devel
  • 安装mmseg
cd mmseg-3.2.14
./configure --prefix=/usr/local/mmseg3
make && sudo make install
  • 安装csft
    configure.ac中去掉AM_INIT_AUTOMAKE里的-Werror
    buildconf.sh中automake后加入 --add-missing
    修改src/sphinxexpr.cpp,ExprEval 改为 this->ExprEval
sh buildconf.sh
./configure --prefix=/usr/local/coreseek  --without-unixodbc --with-mmseg --with-mmseg-includes=/usr/local/mmseg3/include/mmseg/ --with-mmseg-libs=/usr/local/mmseg3/lib/ --with-mysql --with-python
make && sudo make install
  • 安装libsphinxclient
sh buildconf.sh
./configure
make && sudo make install
  • 安装sphinx扩展
    下载源码点击Browse Source,在最下面找到支持php7的源码
./configure 
make && sudo make install

python 数据源设置

  • conf参数如下
python
{
    path = /usr/local/coreseek/etc/pysource 
    path = /usr/local/coreseek/etc/pysource/csft_demo 
}

#源定义
source python
{
    type = python
    name = csft_demo.MainSource
}

#index定义
index python
{
    source           = python 
    path             = /usr/local/coreseek/var/data/python 
    docinfo          = extern
    mlock            = 0
    morphology       = none
    min_word_len     = 1
    html_strip       = 0

    charset_dictpath = /usr/local/mmseg3/etc
    charset_type     = zh_cn.utf-8
}

#全局index定义
indexer
{
    mem_limit            = 128M
}

#searchd服务定义
searchd
{
    listen          = 9312
    read_timeout    = 5
    max_children    = 30
    max_matches     = 1000
    seamless_rotate = 0
    preopen_indexes = 0
    unlink_old      = 1
    pid_file    = /usr/local/coreseek/var/log/searchd_python.pid
    log         = /usr/local/coreseek/var/log/searchd_python.log
    query_log   = /usr/local/coreseek/var/log/query_python.log
    binlog_path = 
}
  • python脚本 内容如下
# -*- coding:utf-8 -*-
# coreseek3.2 python source演示
# author: HonestQiao
# date: 2010-06-03 11:46

class MainSource(object):
    """Demo document source backed by a small hard-coded list of records.

    Each successful call to ``NextDocument`` copies one record from
    ``self.data`` onto instance attributes named after the fields that
    ``GetScheme`` lists.
    """

    def __init__(self, conf):
        """Keep *conf* and prepare the in-memory demo records.

        conf -- configuration object supplied by the caller; this class only
                stores it and never reads it.
        """
        self.conf = conf
        # Position of the next record in self.data that NextDocument will emit.
        self.idx = 0
        self.data = [
            {
                'id': 1,
                'subject': u"愚人节最佳蛊惑爆料 谷歌300亿美元收购百度",
                'context': u'1111',
                'published': 1270131607,
                'author_id': 1,
            },
            {
                'id': 2,
                'subject': u'Twitter主页改版 推普通用户消息增加趋势话题',
                'context': u'2222',
                'published': 1270135548,
                'author_id': 1,
            },
            {
                'id': 3,
                'subject': u'死都要上!Opera Mini 体验版抢先试用',
                'context': u'3333',
                'published': 1270094460,
                'author_id': 2,
            },
        ]

    def GetScheme(self):
        """Return the document structure: the docid, text and integer fields."""
        scheme = [
            ('id', {'docid': True}),
            ('subject', {'type': 'text'}),
            ('context', {'type': 'text'}),
            ('published', {'type': 'integer'}),
            ('author_id', {'type': 'integer'}),
        ]
        return scheme

    def GetFieldOrder(self):
        """Return the priority order of the fields."""
        # NOTE(review): this is a list holding the plain string 'subject' (the
        # original spelled it [('subject')], which is not a tuple) -- confirm
        # whether the consumer expects a tuple of field names here.
        return ['subject']

    def Connected(self):
        """Connection hook; a database-backed source would connect here."""
        return None

    def NextDocument(self, err):
        """Publish the next record on the instance; called once per document.

        err -- accepted but not used by this demo source.

        Returns True after a record has been copied onto the instance, or
        False once every record in ``self.data`` has been emitted.
        """
        if self.idx >= len(self.data):
            return False

        row = self.data[self.idx]
        doc_id = row['id']
        # 'id' is the field marked 'docid': True in GetScheme.
        self.docid = doc_id
        self.id = doc_id
        # Text fields are stored as unicode and handed out as UTF-8 bytes.
        self.subject = row['subject'].encode('utf-8')
        self.context = row['context'].encode('utf-8')
        self.published = row['published']
        self.author_id = row['author_id']
        self.idx += 1
        return True

if __name__ == "__main__":    # standalone demo: walk the source directly
    conf = {}
    source = MainSource(conf)
    source.Connected()

    # NextDocument is declared as NextDocument(self, err); calling it with no
    # argument raises TypeError. The demo source ignores err, so pass None.
    while source.NextDocument(None):
        # Single-argument parenthesised form prints the same text under the
        # Python 2 print statement and also parses as a function call.
        print("id=%d, subject=%s" % (source.docid, source.subject))
#eof

猜你喜欢

转载自blog.csdn.net/wyansai/article/details/81814411