转载 经典的分词方法实现(JAVA)




基于规则的自动分词算法

原理

(1) 事先人工建立好分词词典和分词规则库。
(2) 原理为基于字符串匹配进行分词,这样就要求有足够大的词表为依据。
(3) 通过一定的算法来实现,如正向最大匹配法、逆向最大匹配法、双向匹配法等。
(4) 优缺点:当分词词典所收容的词较少时,显然覆盖度就有限,分词的正确率就低。

正向最大匹配法

算法描述

设MaxLen表示最大词长,D为分词词典
(1) 从待切分语料中按正向取长度为MaxLen的字串str,令Len=MaxLen;
(2) 把str与D中的词相匹配;
(3) 若匹配成功,则认为该字串为词,指向待切分语料的指针向前移Len个汉字(字节),返回到(1);
(4) 若不成功:如果Len>1,则将Len减1,从待切分语料中取长度为Len的字串str,返回到(2)。否则,得到长度为1的单字词,指向待切分语料的指针向前移1个汉字,返回(1)。

算法代码

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 *
 * @author quincy1994
 */
/**
 * Forward maximum matching (MM) segmenter: repeatedly matches the longest
 * dictionary word starting at the current position; unmatched single
 * characters are emitted as-is. Words in the result are joined by "/ ".
 */
public class Nlp {

    private String m_sResult = "";  // accumulated segmentation result
    private int m_nPosIndex;        // current position in the text being segmented
    private int m_MaxLen;           // current candidate word length
    private int totalMaxLen;        // configured maximum word length
    private Set<String> dictionary; // segmentation dictionary

    /**
     * Creates a segmenter with the given maximum word length.
     * A dictionary load failure is logged and leaves the dictionary null.
     *
     * @param maxLen longest word length tried during matching
     */
    public Nlp(int maxLen) {
        this.m_MaxLen = maxLen;
        this.m_nPosIndex = 0;
        this.totalMaxLen = maxLen;
        try {
            this.dictionary = this.loadFile();
        } catch (IOException ex) {
            Logger.getLogger(Nlp.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Creates a segmenter with the default maximum word length of 3. */
    public Nlp() {
        this(3); // delegate instead of duplicating the other constructor
    }

    /**
     * Loads the dictionary from "dict.txt": one comma-separated entry per
     * line, the first field being the word.
     *
     * @return the set of dictionary words
     * @throws IOException if the file is missing or unreadable
     */
    public Set<String> loadFile() throws FileNotFoundException, IOException {
        Set<String> dict = new HashSet<String>();
        String filename = "dict.txt";
        // try-with-resources: the original leaked the reader
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                String[] token = tmp.split(",");
                dict.add(token[0]);
            }
        }
        return dict;
    }

    /**
     * Segments {@code source} with forward maximum matching.
     *
     * @return the words in order, each followed by "/ "
     */
    public String MMSegment(String source) {
        // Reset all matching state so repeated calls start fresh
        // (the original kept appending to m_sResult across calls).
        m_sResult = "";
        m_nPosIndex = 0;
        m_MaxLen = totalMaxLen;
        MM(source, m_MaxLen, m_nPosIndex);
        return m_sResult;
    }

    /**
     * Returns the substring of at most {@code len} characters starting at
     * {@code m_nPosIndex}, clamped to the end of the string.
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        // Clamp in one step (the original decremented in a loop)
        int endIndex = Math.min(m_nPosIndex + len, source.length());
        return source.substring(m_nPosIndex, endIndex);
    }

    /**
     * Recursive forward maximum matching: try the longest candidate at the
     * cursor; on failure shorten it by one character; at length 1 emit the
     * single character unmatched.
     */
    public void MM(String source, int len, int frompos) {
        if (m_nPosIndex >= source.length()) {
            return;
        }
        String sub = getSubString(source, m_nPosIndex, len);
        if (dictionary.contains(sub)) {
            // Match: advance by the matched length. The original advanced
            // by m_MaxLen, overshooting when the candidate was clamped at
            // the end of the string.
            m_sResult += sub + "/ ";
            m_nPosIndex += sub.length();
            m_MaxLen = totalMaxLen;
            MM(source, m_MaxLen, m_nPosIndex);
        } else if (m_MaxLen > 1) {
            // No match: shorten the candidate and retry at the same spot.
            m_MaxLen = m_MaxLen - 1;
            MM(source, m_MaxLen, m_nPosIndex);
        } else {
            // Single character not in the dictionary: emit it as a word.
            m_sResult += sub + "/ ";
            m_nPosIndex += 1;
            m_MaxLen = totalMaxLen;
            MM(source, m_MaxLen, m_nPosIndex);
        }
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        Nlp nlp = new Nlp();
        String source = "今天天气不错!";
        String result = nlp.MMSegment(source);
        System.out.println(result);
    }
}

  
  
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116

逆向最大匹配法

算法描述

与正向最大匹配法原理一样,只是匹配的开始为句尾

代码实现

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 *
 * @author quincy1994
 */
/**
 * Reverse maximum matching (RMM) segmenter: scans from the sentence tail,
 * repeatedly matching the longest dictionary word that ends at the cursor;
 * unmatched single characters are emitted as-is. Words in the result are
 * joined by "/ " in sentence order.
 */
public class RMM {

    private String m_sResult = "";  // matches accumulated as "/word", tail-first
    private int m_nPosIndex;        // cursor: exclusive end index of the current candidate
    private int m_MaxLen;           // current candidate word length
    private int totalMaxlen;        // configured maximum word length
    private Set<String> dictionary; // segmentation dictionary

    /**
     * Creates a segmenter with the given maximum word length.
     * A dictionary load failure is logged and leaves the dictionary null.
     *
     * @param maxLen longest word length tried during matching
     */
    public RMM(int maxLen) {
        this.m_MaxLen = maxLen;
        this.totalMaxlen = maxLen;
        try {
            this.dictionary = loadFile();
        } catch (IOException ex) {
            Logger.getLogger(RMM.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Creates a segmenter with the default maximum word length of 3. */
    public RMM() {
        this(3); // delegate instead of duplicating the other constructor
    }

    /**
     * Loads the dictionary from "dict.txt": one comma-separated entry per
     * line, the first field being the word.
     *
     * @return the set of dictionary words
     * @throws IOException if the file is missing or unreadable
     */
    public Set<String> loadFile() throws IOException {
        Set<String> dict = new HashSet<String>();
        String filename = "dict.txt";
        // try-with-resources: the original leaked the reader
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                String[] token = tmp.split(",");
                dict.add(token[0]);
            }
        }
        return dict;
    }

    /**
     * Segments {@code source} with reverse maximum matching.
     *
     * @return the words in sentence order, each followed by "/ "
     */
    public String RMMSegment(String source) {
        // Reset state so repeated calls start fresh: the original reused a
        // possibly shortened m_MaxLen and kept appending to m_sResult.
        m_sResult = "";
        m_MaxLen = totalMaxlen;
        this.m_nPosIndex = source.length();
        rmm(source, m_MaxLen, m_nPosIndex);

        // Matches were collected tail-first; emit them in sentence order.
        String[] token = m_sResult.split("/");
        StringBuilder result = new StringBuilder();
        for (int i = token.length - 1; i > 0; i--) {
            result.append(token[i]).append("/ ");
        }
        return result.toString();
    }

    /**
     * Returns the substring of at most {@code len} characters that ends at
     * {@code m_nPosIndex}, clamped at the start of the string.
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        // Clamp in one step (the original incremented in a loop)
        int startIndex = Math.max(m_nPosIndex - len, 0);
        return source.substring(startIndex, m_nPosIndex);
    }

    /**
     * Recursive reverse maximum matching: try the longest candidate ending
     * at the cursor; on failure shorten it by one character; at length 1
     * emit the single character unmatched.
     */
    public void rmm(String source, int len, int frompos) {
        // Stop at the beginning of the string. The original tested < 0 and
        // appended one spurious empty token once the cursor reached 0.
        if (m_nPosIndex <= 0) {
            return;
        }
        String sub = getSubString(source, m_nPosIndex, len);
        if (dictionary.contains(sub)) {
            // Match: retreat by the matched length. The original retreated
            // by m_MaxLen, overshooting when the candidate was clamped at
            // the start of the string.
            m_sResult += "/" + sub;
            m_nPosIndex -= sub.length();
            m_MaxLen = totalMaxlen;
            rmm(source, m_MaxLen, m_nPosIndex);
        } else if (m_MaxLen > 1) {
            // No match: shorten the candidate and retry at the same spot.
            m_MaxLen = m_MaxLen - 1;
            rmm(source, m_MaxLen, m_nPosIndex);
        } else {
            // Single character not in the dictionary: emit it as a word.
            m_sResult += "/" + sub;
            m_nPosIndex -= 1;
            m_MaxLen = totalMaxlen;
            rmm(source, m_MaxLen, m_nPosIndex);
        }
    }

    public static void main(String[] args) {
        RMM myRMM = new RMM();
        String source = "记录最佳前候选词列表";
        String result = myRMM.RMMSegment(source);
        System.out.println(result);
    }
}

  
  
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116

基于统计的中文分词算法

基本思想

选择概率最大的分词路径作为最优结果
利用动态规划算法来实现,即最优路径中的第i个词 w_i 的累计概率等于它的左相邻词 w_{i-1} 的累计概率乘以 w_i 自身的概率

具体算法

(1)对一个待分词的字串S,按照从左到右的顺序取出全部候选词 w_1, w_2, …, w_i, …, w_n;
(2)计算每个候选词的概率值 P(w_i),记录每个候选词的全部左邻词;
(3)计算每个候选词的累计概率,累计概率最大的候选词为最佳左邻词;
如果当前词 w_n 是字串的尾词,且累计概率 P'(w_n) 最大,则 w_n 是S的终点词;
(4)从 w_n 开始,按照从右到左顺序,依次将每个词的最佳左邻词输出,即S的分词结果.

字典树

又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希树高。

字典树的代码实现

主要参考:http://blog.csdn.net/sadfishsc/article/details/9152647

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.util.HashMap;
import java.util.Map;

/**
 *
 * @author quincy1994
 */
public class TireNode {
    private String character;           // 单个汉字
    private int frequency = -1;       //     词频, -1来区别某条路径上的字串是否是一个词组
    private double antilog = -1;    //      对数化的词频
    private Map<String, TireNode> children;  //下一个节点

    public String getCharacter(){
        return character;
    }

    public void setCharacter(String character){
        this.character = character;
    }

    public int getFrequency(){
        return frequency;
    }

    public void setFrequency(int frequency){
        this.frequency = frequency;
    }

    public double getAntilog(){
        return antilog;
    }

    public void setAntilog(double antilog){
        this.antilog = antilog;
    }

    public void addChild(TireNode node){
        if (children == null){
            children = new HashMap<String, TireNode>();
        }
        if (!children.containsKey(node.getCharacter())){
            children.put(node.getCharacter(), node);
        }
    }

    public TireNode getChild(String ch){
        if (children == null || ! children.containsKey(ch)){
            return null;
        }
        return children.get(ch);
    }

    public void removeChildren(String ch){
        if (children == null || !children.containsKey(ch)){
            return;
        }
        children.remove(ch);
    }
}

  
  
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68

算法实现

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 *
 * @author quincy1994
 */
/**
 * Statistical Chinese word segmenter. Builds a trie of words with log-scaled
 * frequency costs from "wordFre.txt", enumerates all dictionary words
 * occurring in a sentence, then selects the minimum-total-cost chain of
 * candidates via dynamic programming and renders it as words joined by "/ ".
 */
public class ChnSeq {

    // Root of the word trie; null until init() is called.
    private TireNode tire = null;

    /**
     * Reads the frequency dictionary file into memory, one entry per line.
     * Expected line format (as consumed by init()): word,frequency,percent%.
     *
     * @return all lines of "wordFre.txt"
     * @throws IOException if the file is missing or unreadable
     */
    public List<String> loadFile() throws FileNotFoundException, IOException {
        // Read the dictionary file
        List<String> lines = new ArrayList<String>();
        String filename = "wordFre.txt";
        BufferedReader br = new BufferedReader(new FileReader(filename));
        String tmp;
        while ((tmp = br.readLine()) != null) {
            lines.add(tmp);
        }
        br.close();
        return lines;
    }

    /**
     * Builds the word trie from the dictionary. The terminal node of each
     * word stores the raw frequency and a log-scaled cost.
     *
     * @throws IOException if the dictionary cannot be read
     */
    public void init() throws IOException {
        List<String> lines = loadFile();
        tire = new TireNode();

        for (String line : lines) {
            String[] tokens = line.split(",");
            String word = tokens[0];
            int freq = Integer.parseInt(tokens[1]);
            // Cost = log(1 + 0.01/p), where p is tokens[2] with its '%'
            // stripped — rarer words get a larger cost.
            double antilog =  Math.log(1+0.01/Double.parseDouble(tokens[2].replace("%", ""))) ;
            // Build the trie path for this word, one node per character
            TireNode root = tire;
            for (int i = 0; i < word.length(); i++) {
                String c = "" + word.charAt(i);
                TireNode node = root.getChild(c);
                if (node == null) {
                    node = new TireNode();
                    node.setCharacter(c);
                    root.addChild(node);
                }
                root = node;
            }
            root.setFrequency(freq);    // mark the terminal node with the word frequency
            root.setAntilog(antilog);   // and with its log-scaled cost
        }

    }

    /** Returns the trie root (null before init()). */
    public TireNode getTire() {
        return tire;
    }

    /**
     * Walks the trie along the characters of {@code word}.
     *
     * @return the node reached after the last character, or null when the
     *         path does not exist (word is neither a word nor a prefix)
     */
    public TireNode getNodeByWord(String word) {
        TireNode node = tire;
        for (int i = 0; i < word.length(); i++) {
            String ch = word.charAt(i) + "";
            if (node == null) {
                break;
            } else {
                node = node.getChild(ch);
            }
        }
        return node;
    }

    // One candidate word occurrence within the sentence.
    private class Segment {

        public String word;     // the candidate word itself
        public String endChar;  // last character of the word
        public String lastChar; // character immediately preceding the word in the sentence
        public double cost;     // log-scaled cost (lower = more frequent)

        public final static String START_SIGN = "<< STARTING >>";
        public final static String END_SIGN = "<< ENDING >>";
    }

    /**
     * Enumerates every dictionary word occurring in {@code sentence},
     * bracketed by start and end sentinel segments.
     *
     * @return candidate list: start sentinel, words in scan order, end sentinel
     */
    public List<Segment> preSegment(String sentence) {
        List<Segment> segs = new ArrayList<Segment>();

        // Sentence-start sentinel
        Segment terminal = new Segment();
        terminal.word = Segment.START_SIGN;
        terminal.endChar = Segment.START_SIGN;
        terminal.lastChar = null;
        segs.add(terminal);

        for (int i = 0; i < sentence.length(); i++) {
            for (int j = i + 1; j <= sentence.length(); j++) {
                String word = sentence.substring(i, j);
                TireNode tnode = this.getNodeByWord(word);
                if (tnode == null) {
                    // Not even a prefix of any dictionary word, so no longer
                    // candidate starting at i can exist either.
                    break;
                }
                if (tnode.getFrequency() <= 0) {
                    // A prefix but not a complete word; keep extending.
                    continue;
                }

                Segment seg = new Segment();
                seg.word = word;
                seg.endChar = word.substring(word.length() - 1, word.length());
                if (i == 0) {
                    seg.lastChar = Segment.START_SIGN;
                } else {
                    seg.lastChar = sentence.substring(i - 1, i);
                }
                seg.cost = tnode.getAntilog();
                System.out.println(word + " " + seg.cost +" " + tnode.getFrequency());
                segs.add(seg);
            }
        }

        // Sentence-end sentinel
        terminal = new Segment();
        terminal.word = Segment.END_SIGN;
        terminal.endChar = Segment.END_SIGN;
        terminal.lastChar = sentence.substring(sentence.length() - 1, sentence.length());
        segs.add(terminal);

        return segs;
    }

    /**
     * Picks the minimum-total-cost chain of candidates from the start
     * sentinel (index 0) to the end sentinel (last index) via dynamic
     * programming over forward edges between compatible candidates.
     *
     * @return the segmentation, words joined by "/ ", or null when there
     *         are no candidates
     */
    public String dynamicSegment(List<Segment> segs) {

        // Probability-based segmentation via dynamic programming
        final double INFINITE = 9999999;   // sentinel meaning "no edge"

        if (segs == null || segs.size() == 0) {
            System.out.println("找不到候选词");
            return null;
        }

        int n = segs.size();    // number of candidate segments

        // Seed the cost matrix. Note: endChar.equals(word) holds only when
        // the candidate is a single character (or a sentinel, whose endChar
        // equals its word), so only those diagonal cells get a finite cost.
        // NOTE(review): it is unclear whether multi-character candidates
        // were meant to be excluded here — confirm against the author's intent.
        double[][] costs = new double[n][n];
        for (int i = 0; i < n - 1; i++) {
            for (int j = 0; j < n; j++) {
                String endChar = segs.get(i).endChar;
                if (j == i && endChar.equals(segs.get(j).word)) {
                    costs[i][j] = segs.get(j).cost;    // cost of candidate j
                    continue;
                }
                costs[i][j] = INFINITE;
            }
        }

        // Link candidate i to each later candidate j that can directly
        // follow it: j's preceding character equals i's final character,
        // and j is within 4 positions of i in the candidate list.
        for (int i = 0; i < n - 1; i++) {
            String endChar = segs.get(i).endChar;
            for (int j = i + 1; j < n; j++) {
                String lastChar = segs.get(j).lastChar;
                if (lastChar != null && lastChar.equals(endChar) &&( j- i < 4)) {
                    costs[i][j] = segs.get(j).cost;    // cost of candidate j
                }
            }
        }

        int sp = 0;   // start index (start sentinel)
        int fp = n - 1;    // finish index (end sentinel)

        double[] dist = new double[n];         // accumulated cost per candidate
        List<List<Integer>> sPaths = new ArrayList<List<Integer>>();
        List<Integer> list = new ArrayList<Integer>();
        for (int i = 0; i < n; i++) {
            dist[i] = costs[sp][i];    // initial cost: direct edge sp -> i
            if (sp != i) {
                list.add(i);   // remaining candidate indices to process
            }
            if (dist[i] < INFINITE) {
                List<Integer> spa = new ArrayList<Integer>();     // reachable from sp: open a path
                sPaths.add(spa);
            } else {
                sPaths.add(null);  // unreachable (so far): no path list
            }
        }
        while (!list.isEmpty()) {

            // Take candidates in index order; edges only go forward
            // (minIdx+1..n-1), so index order suffices for relaxation.
            Integer minIdx = list.get(0);
            list.remove(minIdx);  // Integer overload: removes by object, here element 0

            // Skip candidates not reachable from the start
            if(dist[minIdx] == INFINITE){
                continue;
            }

            // Relax every forward edge out of minIdx
            for (int i = minIdx+1; i < n; i++) {
                if (dist[i] > dist[minIdx] + costs[minIdx][i]) {
                    dist[i] = dist[minIdx] + costs[minIdx][i];
                    List<Integer> tmp = new ArrayList<Integer>(sPaths.get(minIdx));
                    tmp.add(minIdx);
                    sPaths.set(i, tmp);  // best predecessor chain for i
                }
            }
        }
        String result = "";
        // NOTE(review): if no path reaches fp, sPaths.get(fp) is null and
        // this throws a NullPointerException — confirm inputs always connect.
        for (int i = 0; i < sPaths.get(fp).size(); i++) {
            result += segs.get(sPaths.get(fp).get(i)).word + "/ ";
        }
        return result;
    }

    /** Convenience wrapper: candidate generation followed by the DP pass. */
    public String segment(String sentences) {
        return dynamicSegment(preSegment(sentences));
    }

    public static void main(String[] args) throws ClassNotFoundException, IOException {
        ChnSeq cs = new ChnSeq();
        cs.init();
        String sentence = "在这一年中,改革开放和现代化建设继续向前迈进。经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。";
        String segs = cs.segment(sentence);
        System.out.println(segs);
    }
}

  
  
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
  • 205
  • 206
  • 207
  • 208
  • 209
  • 210
  • 211
  • 212
  • 213
  • 214
  • 215
  • 216
  • 217
  • 218
  • 219
  • 220
  • 221
  • 222
  • 223
  • 224
  • 225
  • 226
  • 227
  • 228
  • 229
  • 230
  • 231

具体的代码和字典,可以访问:
https://github.com/Quincy1994/Segment

扫描二维码关注公众号,回复: 1642121 查看本文章




阅读更多



基于规则的自动分词算法

猜你喜欢

转载自blog.csdn.net/u011289652/article/details/80535324