转载 经典的分词方法实现(JAVA)




基于规则的自动分词算法

原理

(1) 事先人工建立好分词词典和分词规则库。
(2) 原理为基于字符串匹配进行分词,这样就要求有足够大的词表为依据。
(3) 通过一定的算法来实现,如正向最大匹配法、逆向最大匹配法、双向匹配法等。
(4) 优缺点:当分词词典所收容的词较少时,显然覆盖度就有限,分词的正确率就低。

正向最大匹配法

算法描述

设MaxLen表示最大词长,D为分词词典
(1) 从待切分语料中按正向取长度为MaxLen的字串str,令Len=MaxLen;
(2) 把str与D中的词相匹配;
(3) 若匹配成功,则认为该字串为词,指向待切分语料的指针向前移Len个汉字(字节),返回到(1);
(4) 若不成功:如果Len>1,则将Len减1,从待切分语料中取长度为Len的字串str,返回到(2)。否则,得到长度为1的单字词,指向待切分语料的指针向前移1个汉字,返回(1)。

算法代码

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 *
 * @author quincy1994
 */
/**
 * Forward maximum matching (MM) segmenter: repeatedly matches the longest
 * dictionary word starting at the current position; unmatched single
 * characters are emitted as-is. Words in the result are joined by "/ ".
 */
public class Nlp {

    private String m_sResult = "";  // accumulated segmentation result
    private int m_nPosIndex;        // current position in the text being segmented
    private int m_MaxLen;           // current candidate word length
    private int totalMaxLen;        // configured maximum word length
    private Set<String> dictionary; // segmentation dictionary

    /**
     * Creates a segmenter with the given maximum word length.
     * A dictionary load failure is logged and leaves the dictionary null.
     *
     * @param maxLen longest word length tried during matching
     */
    public Nlp(int maxLen) {
        this.m_MaxLen = maxLen;
        this.m_nPosIndex = 0;
        this.totalMaxLen = maxLen;
        try {
            this.dictionary = this.loadFile();
        } catch (IOException ex) {
            Logger.getLogger(Nlp.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Creates a segmenter with the default maximum word length of 3. */
    public Nlp() {
        this(3); // delegate instead of duplicating the other constructor
    }

    /**
     * Loads the dictionary from "dict.txt": one comma-separated entry per
     * line, the first field being the word.
     *
     * @return the set of dictionary words
     * @throws IOException if the file is missing or unreadable
     */
    public Set<String> loadFile() throws FileNotFoundException, IOException {
        Set<String> dict = new HashSet<String>();
        String filename = "dict.txt";
        // try-with-resources: the original leaked the reader
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                String[] token = tmp.split(",");
                dict.add(token[0]);
            }
        }
        return dict;
    }

    /**
     * Segments {@code source} with forward maximum matching.
     *
     * @return the words in order, each followed by "/ "
     */
    public String MMSegment(String source) {
        // Reset all matching state so repeated calls start fresh
        // (the original kept appending to m_sResult across calls).
        m_sResult = "";
        m_nPosIndex = 0;
        m_MaxLen = totalMaxLen;
        MM(source, m_MaxLen, m_nPosIndex);
        return m_sResult;
    }

    /**
     * Returns the substring of at most {@code len} characters starting at
     * {@code m_nPosIndex}, clamped to the end of the string.
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        // Clamp in one step (the original decremented in a loop)
        int endIndex = Math.min(m_nPosIndex + len, source.length());
        return source.substring(m_nPosIndex, endIndex);
    }

    /**
     * Recursive forward maximum matching: try the longest candidate at the
     * cursor; on failure shorten it by one character; at length 1 emit the
     * single character unmatched.
     */
    public void MM(String source, int len, int frompos) {
        if (m_nPosIndex >= source.length()) {
            return;
        }
        String sub = getSubString(source, m_nPosIndex, len);
        if (dictionary.contains(sub)) {
            // Match: advance by the matched length. The original advanced
            // by m_MaxLen, overshooting when the candidate was clamped at
            // the end of the string.
            m_sResult += sub + "/ ";
            m_nPosIndex += sub.length();
            m_MaxLen = totalMaxLen;
            MM(source, m_MaxLen, m_nPosIndex);
        } else if (m_MaxLen > 1) {
            // No match: shorten the candidate and retry at the same spot.
            m_MaxLen = m_MaxLen - 1;
            MM(source, m_MaxLen, m_nPosIndex);
        } else {
            // Single character not in the dictionary: emit it as a word.
            m_sResult += sub + "/ ";
            m_nPosIndex += 1;
            m_MaxLen = totalMaxLen;
            MM(source, m_MaxLen, m_nPosIndex);
        }
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        Nlp nlp = new Nlp();
        String source = "今天天气不错!";
        String result = nlp.MMSegment(source);
        System.out.println(result);
    }
}

  
  
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116

逆向最大匹配法

算法描述

与正向最大匹配法原理一样,只是匹配的开始为句尾

代码实现

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 *
 * @author quincy1994
 */
/**
 * Reverse maximum matching (RMM) segmenter: scans from the sentence tail,
 * repeatedly matching the longest dictionary word that ends at the cursor;
 * unmatched single characters are emitted as-is. Words in the result are
 * joined by "/ " in sentence order.
 */
public class RMM {

    private String m_sResult = "";  // matches accumulated as "/word", tail-first
    private int m_nPosIndex;        // cursor: exclusive end index of the current candidate
    private int m_MaxLen;           // current candidate word length
    private int totalMaxlen;        // configured maximum word length
    private Set<String> dictionary; // segmentation dictionary

    /**
     * Creates a segmenter with the given maximum word length.
     * A dictionary load failure is logged and leaves the dictionary null.
     *
     * @param maxLen longest word length tried during matching
     */
    public RMM(int maxLen) {
        this.m_MaxLen = maxLen;
        this.totalMaxlen = maxLen;
        try {
            this.dictionary = loadFile();
        } catch (IOException ex) {
            Logger.getLogger(RMM.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Creates a segmenter with the default maximum word length of 3. */
    public RMM() {
        this(3); // delegate instead of duplicating the other constructor
    }

    /**
     * Loads the dictionary from "dict.txt": one comma-separated entry per
     * line, the first field being the word.
     *
     * @return the set of dictionary words
     * @throws IOException if the file is missing or unreadable
     */
    public Set<String> loadFile() throws IOException {
        Set<String> dict = new HashSet<String>();
        String filename = "dict.txt";
        // try-with-resources: the original leaked the reader
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                String[] token = tmp.split(",");
                dict.add(token[0]);
            }
        }
        return dict;
    }

    /**
     * Segments {@code source} with reverse maximum matching.
     *
     * @return the words in sentence order, each followed by "/ "
     */
    public String RMMSegment(String source) {
        // Reset state so repeated calls start fresh: the original reused a
        // possibly shortened m_MaxLen and kept appending to m_sResult.
        m_sResult = "";
        m_MaxLen = totalMaxlen;
        this.m_nPosIndex = source.length();
        rmm(source, m_MaxLen, m_nPosIndex);

        // Matches were collected tail-first; emit them in sentence order.
        String[] token = m_sResult.split("/");
        StringBuilder result = new StringBuilder();
        for (int i = token.length - 1; i > 0; i--) {
            result.append(token[i]).append("/ ");
        }
        return result.toString();
    }

    /**
     * Returns the substring of at most {@code len} characters that ends at
     * {@code m_nPosIndex}, clamped at the start of the string.
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        // Clamp in one step (the original incremented in a loop)
        int startIndex = Math.max(m_nPosIndex - len, 0);
        return source.substring(startIndex, m_nPosIndex);
    }

    /**
     * Recursive reverse maximum matching: try the longest candidate ending
     * at the cursor; on failure shorten it by one character; at length 1
     * emit the single character unmatched.
     */
    public void rmm(String source, int len, int frompos) {
        // Stop at the beginning of the string. The original tested < 0 and
        // appended one spurious empty token once the cursor reached 0.
        if (m_nPosIndex <= 0) {
            return;
        }
        String sub = getSubString(source, m_nPosIndex, len);
        if (dictionary.contains(sub)) {
            // Match: retreat by the matched length. The original retreated
            // by m_MaxLen, overshooting when the candidate was clamped at
            // the start of the string.
            m_sResult += "/" + sub;
            m_nPosIndex -= sub.length();
            m_MaxLen = totalMaxlen;
            rmm(source, m_MaxLen, m_nPosIndex);
        } else if (m_MaxLen > 1) {
            // No match: shorten the candidate and retry at the same spot.
            m_MaxLen = m_MaxLen - 1;
            rmm(source, m_MaxLen, m_nPosIndex);
        } else {
            // Single character not in the dictionary: emit it as a word.
            m_sResult += "/" + sub;
            m_nPosIndex -= 1;
            m_MaxLen = totalMaxlen;
            rmm(source, m_MaxLen, m_nPosIndex);
        }
    }

    public static void main(String[] args) {
        RMM myRMM = new RMM();
        String source = "记录最佳前候选词列表";
        String result = myRMM.RMMSegment(source);
        System.out.println(result);
    }
}

  
  
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116

基于统计的中文分词算法

基本思想

选择概率最大的分词路径作为最优结果
利用动态规划算法来实现,即最优路径中的第i个词 w_i 的累计概率等于它的左相邻词 w_{i-1} 的累计概率乘以 w_i 自身的概率

具体算法

(1)对一个待分词的字串S,按照从左到右的顺序取出全部候选词 w_1, w_2, …, w_i, …, w_n;
(2)计算每个候选词的概率值 P(w_i),记录每个候选词的全部左邻词;
(3)计算每个候选词的累计概率,累计概率最大的候选词为最佳左邻词;
如果当前词 w_n 是字串的尾词,且累计概率 P'(w_n) 最大,则 w_n 是S的终点词;
(4)从 w_n 开始,按照从右到左顺序,依次将每个词的最佳左邻词输出,即S的分词结果.

字典树

又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希树高。

字典树的代码实现

主要参考:http://blog.csdn.net/sadfishsc/article/details/9152647

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.util.HashMap;
import java.util.Map;

/**
 *
 * @author quincy1994
 */
public class TireNode {
    private String character;           // 单个汉字
    private int frequency = -1;       //     词频, -1来区别某条路径上的字串是否是一个词组
    private double antilog = -1;    //      对数化的词频
    private Map<String, TireNode> children;  //下一个节点

    public String getCharacter(){
        return character;
    }

    public void setCharacter(String character){
        this.character = character;
    }

    public int getFrequency(){
        return frequency;
    }

    public void setFrequency(int frequency){
        this.frequency = frequency;
    }

    public double getAntilog(){
        return antilog;
    }

    public void setAntilog(double antilog){
        this.antilog = antilog;
    }

    public void addChild(TireNode node){
        if (children == null){
            children = new HashMap<String, TireNode>();
        }
        if (!children.containsKey(node.getCharacter())){
            children.put(node.getCharacter(), node);
        }
    }

    public TireNode getChild(String ch){
        if (children == null || ! children.containsKey(ch)){
            return null;
        }
        return children.get(ch);
    }

    public void removeChildren(String ch){
        if (children == null || !children.containsKey(ch)){
            return;
        }
        children.remove(ch);
    }
}

  
  
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68

算法实现

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 *
 * @author quincy1994
 */
/**
 * Statistical Chinese word segmenter. Builds a trie of words with log-scaled
 * frequency costs from "wordFre.txt", enumerates all dictionary words
 * occurring in a sentence, then selects the minimum-total-cost chain of
 * candidates via dynamic programming and renders it as words joined by "/ ".
 */
public class ChnSeq {

    // Root of the word trie; null until init() is called.
    private TireNode tire = null;

    /**
     * Reads the frequency dictionary file into memory, one entry per line.
     * Expected line format (as consumed by init()): word,frequency,percent%.
     *
     * @return all lines of "wordFre.txt"
     * @throws IOException if the file is missing or unreadable
     */
    public List<String> loadFile() throws FileNotFoundException, IOException {
        // Read the dictionary file
        List<String> lines = new ArrayList<String>();
        String filename = "wordFre.txt";
        BufferedReader br = new BufferedReader(new FileReader(filename));
        String tmp;
        while ((tmp = br.readLine()) != null) {
            lines.add(tmp);
        }
        br.close();
        return lines;
    }

    /**
     * Builds the word trie from the dictionary. The terminal node of each
     * word stores the raw frequency and a log-scaled cost.
     *
     * @throws IOException if the dictionary cannot be read
     */
    public void init() throws IOException {
        List<String> lines = loadFile();
        tire = new TireNode();

        for (String line : lines) {
            String[] tokens = line.split(",");
            String word = tokens[0];
            int freq = Integer.parseInt(tokens[1]);
            // Cost = log(1 + 0.01/p), where p is tokens[2] with its '%'
            // stripped — rarer words get a larger cost.
            double antilog =  Math.log(1+0.01/Double.parseDouble(tokens[2].replace("%", ""))) ;
            // Build the trie path for this word, one node per character
            TireNode root = tire;
            for (int i = 0; i < word.length(); i++) {
                String c = "" + word.charAt(i);
                TireNode node = root.getChild(c);
                if (node == null) {
                    node = new TireNode();
                    node.setCharacter(c);
                    root.addChild(node);
                }
                root = node;
            }
            root.setFrequency(freq);    // mark the terminal node with the word frequency
            root.setAntilog(antilog);   // and with its log-scaled cost
        }

    }

    /** Returns the trie root (null before init()). */
    public TireNode getTire() {
        return tire;
    }

    /**
     * Walks the trie along the characters of {@code word}.
     *
     * @return the node reached after the last character, or null when the
     *         path does not exist (word is neither a word nor a prefix)
     */
    public TireNode getNodeByWord(String word) {
        TireNode node = tire;
        for (int i = 0; i < word.length(); i++) {
            String ch = word.charAt(i) + "";
            if (node == null) {
                break;
            } else {
                node = node.getChild(ch);
            }
        }
        return node;
    }

    // One candidate word occurrence within the sentence.
    private class Segment {

        public String word;     // the candidate word itself
        public String endChar;  // last character of the word
        public String lastChar; // character immediately preceding the word in the sentence
        public double cost;     // log-scaled cost (lower = more frequent)

        public final static String START_SIGN = "<< STARTING >>";
        public final static String END_SIGN = "<< ENDING >>";
    }

    /**
     * Enumerates every dictionary word occurring in {@code sentence},
     * bracketed by start and end sentinel segments.
     *
     * @return candidate list: start sentinel, words in scan order, end sentinel
     */
    public List<Segment> preSegment(String sentence) {
        List<Segment> segs = new ArrayList<Segment>();

        // Sentence-start sentinel
        Segment terminal = new Segment();
        terminal.word = Segment.START_SIGN;
        terminal.endChar = Segment.START_SIGN;
        terminal.lastChar = null;
        segs.add(terminal);

        for (int i = 0; i < sentence.length(); i++) {
            for (int j = i + 1; j <= sentence.length(); j++) {
                String word = sentence.substring(i, j);
                TireNode tnode = this.getNodeByWord(word);
                if (tnode == null) {
                    // Not even a prefix of any dictionary word, so no longer
                    // candidate starting at i can exist either.
                    break;
                }
                if (tnode.getFrequency() <= 0) {
                    // A prefix but not a complete word; keep extending.
                    continue;
                }

                Segment seg = new Segment();
                seg.word = word;
                seg.endChar = word.substring(word.length() - 1, word.length());
                if (i == 0) {
                    seg.lastChar = Segment.START_SIGN;
                } else {
                    seg.lastChar = sentence.substring(i - 1, i);
                }
                seg.cost = tnode.getAntilog();
                System.out.println(word + " " + seg.cost +" " + tnode.getFrequency());
                segs.add(seg);
            }
        }

        // Sentence-end sentinel
        terminal = new Segment();
        terminal.word = Segment.END_SIGN;
        terminal.endChar = Segment.END_SIGN;
        terminal.lastChar = sentence.substring(sentence.length() - 1, sentence.length());
        segs.add(terminal);

        return segs;
    }

    /**
     * Picks the minimum-total-cost chain of candidates from the start
     * sentinel (index 0) to the end sentinel (last index) via dynamic
     * programming over forward edges between compatible candidates.
     *
     * @return the segmentation, words joined by "/ ", or null when there
     *         are no candidates
     */
    public String dynamicSegment(List<Segment> segs) {

        // Probability-based segmentation via dynamic programming
        final double INFINITE = 9999999;   // sentinel meaning "no edge"

        if (segs == null || segs.size() == 0) {
            System.out.println("找不到候选词");
            return null;
        }

        int n = segs.size();    // number of candidate segments

        // Seed the cost matrix. Note: endChar.equals(word) holds only when
        // the candidate is a single character (or a sentinel, whose endChar
        // equals its word), so only those diagonal cells get a finite cost.
        // NOTE(review): it is unclear whether multi-character candidates
        // were meant to be excluded here — confirm against the author's intent.
        double[][] costs = new double[n][n];
        for (int i = 0; i < n - 1; i++) {
            for (int j = 0; j < n; j++) {
                String endChar = segs.get(i).endChar;
                if (j == i && endChar.equals(segs.get(j).word)) {
                    costs[i][j] = segs.get(j).cost;    // cost of candidate j
                    continue;
                }
                costs[i][j] = INFINITE;
            }
        }

        // Link candidate i to each later candidate j that can directly
        // follow it: j's preceding character equals i's final character,
        // and j is within 4 positions of i in the candidate list.
        for (int i = 0; i < n - 1; i++) {
            String endChar = segs.get(i).endChar;
            for (int j = i + 1; j < n; j++) {
                String lastChar = segs.get(j).lastChar;
                if (lastChar != null && lastChar.equals(endChar) &&( j- i < 4)) {
                    costs[i][j] = segs.get(j).cost;    // cost of candidate j
                }
            }
        }

        int sp = 0;   // start index (start sentinel)
        int fp = n - 1;    // finish index (end sentinel)

        double[] dist = new double[n];         // accumulated cost per candidate
        List<List<Integer>> sPaths = new ArrayList<List<Integer>>();
        List<Integer> list = new ArrayList<Integer>();
        for (int i = 0; i < n; i++) {
            dist[i] = costs[sp][i];    // initial cost: direct edge sp -> i
            if (sp != i) {
                list.add(i);   // remaining candidate indices to process
            }
            if (dist[i] < INFINITE) {
                List<Integer> spa = new ArrayList<Integer>();     // reachable from sp: open a path
                sPaths.add(spa);
            } else {
                sPaths.add(null);  // unreachable (so far): no path list
            }
        }
        while (!list.isEmpty()) {

            // Take candidates in index order; edges only go forward
            // (minIdx+1..n-1), so index order suffices for relaxation.
            Integer minIdx = list.get(0);
            list.remove(minIdx);  // Integer overload: removes by object, here element 0

            // Skip candidates not reachable from the start
            if(dist[minIdx] == INFINITE){
                continue;
            }

            // Relax every forward edge out of minIdx
            for (int i = minIdx+1; i < n; i++) {
                if (dist[i] > dist[minIdx] + costs[minIdx][i]) {
                    dist[i] = dist[minIdx] + costs[minIdx][i];
                    List<Integer> tmp = new ArrayList<Integer>(sPaths.get(minIdx));
                    tmp.add(minIdx);
                    sPaths.set(i, tmp);  // best predecessor chain for i
                }
            }
        }
        String result = "";
        // NOTE(review): if no path reaches fp, sPaths.get(fp) is null and
        // this throws a NullPointerException — confirm inputs always connect.
        for (int i = 0; i < sPaths.get(fp).size(); i++) {
            result += segs.get(sPaths.get(fp).get(i)).word + "/ ";
        }
        return result;
    }

    /** Convenience wrapper: candidate generation followed by the DP pass. */
    public String segment(String sentences) {
        return dynamicSegment(preSegment(sentences));
    }

    public static void main(String[] args) throws ClassNotFoundException, IOException {
        ChnSeq cs = new ChnSeq();
        cs.init();
        String sentence = "在这一年中,改革开放和现代化建设继续向前迈进。经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。";
        String segs = cs.segment(sentence);
        System.out.println(segs);
    }
}

  
  
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
  • 205
  • 206
  • 207
  • 208
  • 209
  • 210
  • 211
  • 212
  • 213
  • 214
  • 215
  • 216
  • 217
  • 218
  • 219
  • 220
  • 221
  • 222
  • 223
  • 224
  • 225
  • 226
  • 227
  • 228
  • 229
  • 230
  • 231

具体的代码和字典,可以访问:
https://github.com/Quincy1994/Segment

扫描二维码关注公众号,回复: 1642121 查看本文章




阅读更多



基于规则的自动分词算法

猜你喜欢

转载自blog.csdn.net/u011289652/article/details/80535324