Lucene 分词

Lucene 分词

Lucene maven导入

<properties>
        <java.version>1.8</java.version>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <maven.compiler.compilerVersion>1.8</maven.compiler.compilerVersion>
        <lucene.version>7.6.0</lucene.version>
</properties>
 <dependencies>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-smartcn -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-smartcn</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queryparser -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queries -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queries</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-highlighter -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <!-- 注意:lucene-analyzers 是 3.x 时代的旧构件,与上面的 7.6.0 各模块
             (lucene-analyzers-common 已取代它)存在类冲突;除非需要兼容遗留
             3.x 代码,否则应移除此依赖。 -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers</artifactId>
            <version>3.6.2</version>
        </dependency>
</dependencies>

Lucene分词

  • StopAnalyzer(停用词分词)
    StopAnalyzer能够过滤词汇中特定字符串和词汇,并且完成大写转小写的功能。

  • StandardAnalyzer(标准分词)
    StandardAnalyzer根据空格和符号来完成分词,还可以完成数字、字母、E-mail地址、IP地址以及中文字符的分析处理,还可以支持过滤词表,用来代替StopAnalyzer能够实现的功能。

  • WhitespaceAnalyzer(空格分词)
    WhitespaceAnalyzer使用空格作为间隔符的词汇分割分词器。

  • SimpleAnalyzer(简单分词)
    SimpleAnalyzer具备西文字符词汇分析的分词器,处理词汇单元时,以非字母字符作为分割符号。

  • CJKAnalyzer(二分法分词)
    内部调用CJKTokenizer分词器,对中文进行分词

  • KeywordAnalyzer(关键字分词)
    KeywordAnalyzer把整个输入作为一个单独词汇单元,方便特殊类型的文本进行索引和检索。针对邮政编码、地址等文本信息使用关键字分词器进行索引项建立非常方便。

多种分词器实例

/**
 * Demonstrates the token output of several built-in Lucene analyzers
 * (standard, whitespace, simple, CJK bigram, keyword, stop-word) plus
 * SmartChineseAnalyzer, all run against the same Chinese sample sentence.
 */
public class VariousAnalyzers {
    private static String str = "中华人名共和国简称中国, 是一个有13亿人口的国家。";

    public static void main(String[] args) throws IOException{
        Analyzer analyzer = null;
        analyzer = new StandardAnalyzer();
        System.out.println("标准分词:" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new WhitespaceAnalyzer();
        System.out.println("空格分词" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new SimpleAnalyzer();
        System.out.println("简单分词" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new CJKAnalyzer();
        System.out.println("二分法分词" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new KeywordAnalyzer();
        System.out.println("关键字分词" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new StopAnalyzer();
        System.out.println("停用词分词" + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new SmartChineseAnalyzer();
        System.out.println("中文分词" + analyzer.getClass());
        printAnalyzer(analyzer);
    }

    /**
     * Tokenizes the sample sentence with the given analyzer and prints each
     * term separated by '|'. Follows the mandatory TokenStream workflow:
     * reset() -> incrementToken() loop -> end() -> close().
     *
     * @param analyzer the analyzer to demonstrate; it is closed afterwards,
     *                 so each analyzer instance is single-use here
     * @throws IOException if the underlying tokenizer fails
     */
    public static void printAnalyzer(Analyzer analyzer) throws IOException{
        StringReader reader = new StringReader(str);
        // First argument is the field name; the sample text doubles as one here.
        TokenStream tokenStream = analyzer.tokenStream(str, reader);
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while(tokenStream.incrementToken()){
                System.out.print(termAttribute.toString() + "|");
            }
            // end() sets end-of-stream attributes; required by the TokenStream contract.
            tokenStream.end();
        } finally {
            // Always release the stream, even if tokenization throws.
            tokenStream.close();
        }
        System.out.println("\n");
        analyzer.close();
    }
}

分词结果

标准分词:class org.apache.lucene.analysis.standard.StandardAnalyzer
中|华|人|名|共|和|国|简|称|中|国|是|一|个|有|13|亿|人|口|的|国|家|

空格分词class org.apache.lucene.analysis.core.WhitespaceAnalyzer
中华人名共和国简称中国,|是一个有13亿人口的国家。|

简单分词class org.apache.lucene.analysis.core.SimpleAnalyzer
中华人名共和国简称中国|是一个有|亿人口的国家|

二分法分词class org.apache.lucene.analysis.cjk.CJKAnalyzer
中华|华人|人名|名共|共和|和国|国简|简称|称中|中国|是一|一个|个有|13|亿人|人口|口的|的国|国家|

关键字分词class org.apache.lucene.analysis.core.KeywordAnalyzer
中华人名共和国简称中国, 是一个有13亿人口的国家。|

停用词分词class org.apache.lucene.analysis.core.StopAnalyzer
中华人名共和国简称中国|是一个有|亿人口的国家|

中文分词class org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer
中华|人名|共和国|简称|中国|是|一个|有|13|亿|人口|的|国家|

IK分词器配置

IKTokenizer6x

import java.io.IOException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
 * Lucene 6.x+ Tokenizer adapter around the IK Analyzer segmenter.
 * Bridges IK's {@code Lexeme} results into Lucene's attribute-based
 * TokenStream API (term text, offsets, lexeme type).
 */
public class IKTokenizer6x extends Tokenizer{
    private IKSegmenter _IKImplemenet;
    private final CharTermAttribute termAttribute;
    private final OffsetAttribute offsetAttribute;
    private final TypeAttribute typeAttribute;
    // End offset of the last token emitted; reported as the final offset in end().
    private int endPosition;

    /**
     * @param useSmart true for IK's smart (coarse-grained) segmentation,
     *                 false for finest-grained segmentation
     */
    public IKTokenizer6x(boolean useSmart){
        super();
        offsetAttribute = addAttribute(OffsetAttribute.class);
        termAttribute = addAttribute(CharTermAttribute.class);
        typeAttribute = addAttribute(TypeAttribute.class);
        _IKImplemenet = new IKSegmenter(input, useSmart);
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        Lexeme nextLexeme = _IKImplemenet.next();
        if(nextLexeme != null){
            termAttribute.append(nextLexeme.getLexemeText());
            termAttribute.setLength(nextLexeme.getLength());
            offsetAttribute.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
            endPosition = nextLexeme.getEndPosition();
            // Record the lexeme's type label, not its text (original code
            // mistakenly stored getLexemeText() here).
            typeAttribute.setType(nextLexeme.getLexemeTypeString());
            return true;
        }
        return false;
    }

    @Override
    public void reset() throws IOException{
        super.reset();
        // Re-bind the segmenter to the (possibly new) input reader.
        _IKImplemenet.reset(input);
    }

    @Override
    public final void end() throws IOException {
        // super.end() is required so Lucene can set end-of-stream attributes.
        super.end();
        int finalOffset = correctOffset(this.endPosition);
        offsetAttribute.setOffset(finalOffset, finalOffset);
    }
}

IKAnalyzer6x

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Lucene Analyzer backed by the IK tokenizer ({@link IKTokenizer6x}).
 * The {@code useSmart} flag selects between IK's smart (coarse-grained)
 * and finest-grained segmentation modes.
 */
public class IKAnalyzer6x extends Analyzer{
    private boolean useSmart;

    /** @return whether smart (coarse-grained) segmentation is enabled */
    public boolean isUseSmart(){
        return useSmart;
    }

    /** Defaults to finest-grained segmentation (smart mode off). */
    public IKAnalyzer6x(){
        this(false);
    }

    /**
     * @param useSmart true to enable IK's smart segmentation mode
     */
    public IKAnalyzer6x(boolean useSmart) {
        super();
        this.useSmart = useSmart;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // One fresh IK tokenizer per field; Lucene reuses the components.
        Tokenizer ikTokenizer = new IKTokenizer6x(isUseSmart());
        TokenStreamComponents components = new TokenStreamComponents(ikTokenizer);
        return components;
    }
}

Lucene 扩展词典

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer 扩展配置</comment>
    <!--用户可以在这里配置自己的扩展字典 -->
    <!--<entry key="ext_dict">com/org/config/my.dic;com/org/config/mine.dic;</entry>-->
    <!--用户可以在这里配置自己的扩展停止词字典-->
    <entry key="ext_stopwords">stopword.dic; ext_stopword.dic</entry>

    <entry key="ext_dict">ext.dic</entry>
</properties>

猜你喜欢

转载自blog.csdn.net/top_wind_cloud/article/details/85852604