NLP Basics: Named Entity Recognition (1) Rule-Based Methods


Named entity recognition

Named entity recognition (NER), like automatic word segmentation and POS tagging, is a fundamental task in natural language processing. Its goal is to identify named entities such as person names, place names, and organization names in a corpus.

There are usually two rule-based approaches: the first matches entities with hand-written regular expressions; the second uses StanfordCoreNLP. A minimal sketch of the regex approach is shown below; the rest of this post covers the StanfordCoreNLP approach.
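The regex variant is not shown in the original post, so here is a minimal sketch of my own. The patterns are hypothetical toy rules; the idea is simply that an entity is any substring matching a hand-written pattern.

#-*- coding=utf8 -*-
import re

# Hypothetical rule set; real rule-based NER uses much larger,
# language-specific pattern libraries.
patterns = {
    'date': re.compile(r'\d{4}年\d{1,2}月\d{1,2}日|\d{4}-\d{1,2}-\d{1,2}'),
    'phone': re.compile(r'1[3-9]\d{9}'),
}

def regex_ner(text):
    # Return {label: [matched substrings]} for every pattern that matches
    result = {}
    for label, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            result[label] = matches
    return result

print(regex_ner('会议定于2019年6月12日举行,联系电话13812345678。'))
# {'date': ['2019年6月12日'], 'phone': ['13812345678']}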

The StanfordCoreNLP method:

ner.py : the main entry script, which reads the input text

#-*- coding=utf8 -*-
from grammer.rules import grammer_parse

fp = open('text.txt', 'r', encoding='utf8')
fout = open('out.txt', 'w', encoding='utf8')
# Iterate over the file object directly (iterating over fp.readlines() also works),
# parsing every non-empty line and writing the results to out.txt
for line in fp:
    if len(line.strip()) > 0:
        grammer_parse(line.strip(), fout)
fp.close()
fout.close()
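Each non-empty input line that yields at least one entity appends one pretty-printed JSON record to out.txt. The shape looks like this (the values here are hypothetical; the leading spaces come from the string concatenation in rules.py below):

{
    "date": " 2019年6月12日",
    "num": "",
    "org": " 国务院",
    "loc": " 北京"
}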

rules.py : defines the grammar rules

#encoding=utf8
import nltk, json

from .tools import ner_stanford

def get_stanford_ner_nodes(parent):
    # Walk the chunked tree and concatenate the tokens under each entity label
    date = ''
    num = ''
    org = ''
    loc = ''
    for node in parent:
        if type(node) is nltk.Tree:
            if node.label() == 'DATE':
                date = date + " " + ''.join([i[0] for i in node])
            elif node.label() == 'NUMBER':
                num = num + " " + ''.join([i[0] for i in node])
            # 'ORGANIZATIONL' matches the label as it is spelled in the grammar below
            elif node.label() == 'ORGANIZATIONL':
                org = org + " " + ''.join([i[0] for i in node])
            elif node.label() == 'LOCATION':
                loc = loc + " " + ''.join([i[0] for i in node])
    if len(num) > 0 or len(date) > 0 or len(org) > 0 or len(loc) > 0:
        return {'date': date, 'num': num, 'org': org, 'loc': loc}
    else:
        return {}
def grammer_parse(raw_sentence=None, file_object=None):
    # Skip sentences that are too short
    if len(raw_sentence.strip()) < 5:
        return False
    # Define the grammar: e.g. <DATE>+ merges one or more consecutive
    # DATE tags into a single DATE chunk
    grammer_dict = \
    {
         'stanford_ner_drop': r"""
        DATE:{<DATE>+<MISC>?<DATE>*<O>{2}}
        {<DATE>+<MISC>?<DATE>*}
        {<DATE>+}
        {<TIME>+}
        ORGANIZATIONL:{<ORGANIZATION>+}
        LOCATION:{<LOCATION|STATE_OR_PROVINCE|CITY|COUNTRY>+}
        """
    }
    # Compile the grammar with NLTK's RegexpParser
    stanford_ner_drop_rp = nltk.RegexpParser(grammer_dict['stanford_ner_drop'])
    try:
        # ner_stanford(raw_sentence) tags each token with its entity type;
        # the tag 'O' means the token is outside every entity type we defined.
        # The parse result is an nltk.Tree, which can be visualized with draw().
        stanford_ner_drop_result = stanford_ner_drop_rp.parse(ner_stanford(raw_sentence))
    except Exception:
        print("the error sentence is {}".format(raw_sentence))
    else:
        # Merge the nodes of the resulting tree by entity label
        stanford_keep_drop_dict = get_stanford_ner_nodes(stanford_ner_drop_result)
        if len(stanford_keep_drop_dict) > 0:
            # Serialize the dict to a JSON string and write it to the output file
            file_object.write(json.dumps(stanford_keep_drop_dict,
                                         ensure_ascii=False,
                                         indent=4))
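To see what the grammar does without a running CoreNLP server, you can feed nltk.RegexpParser a hand-made list of (token, tag) pairs in the same shape that ner_stanford returns. The sentence below is a toy example of my own:

import nltk

# A trimmed version of the grammar above, applied to hand-tagged tokens
grammar = r"""
DATE: {<DATE>+}
LOCATION: {<LOCATION|CITY|COUNTRY>+}
"""
cp = nltk.RegexpParser(grammar)

# (token, tag) pairs in the shape ner_stanford would return
tagged = [('2019', 'DATE'), ('年', 'DATE'), ('北京', 'CITY'), ('开会', 'O')]
tree = cp.parse(tagged)
print(tree)
# (S (DATE 2019/DATE 年/DATE) (LOCATION 北京/CITY) 开会/O)

Passing this tree to get_stanford_ner_nodes above would then yield {'date': ' 2019年', 'num': '', 'org': '', 'loc': ' 北京'}.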

tools.py : wraps StanfordCoreNLP and HanLP to support the code above

#encoding=utf8
import os, gc, re, sys
import logging
from itertools import chain
from stanfordcorenlp import StanfordCoreNLP
from jpype import startJVM, getDefaultJVMPath, JClass

# Start the JVM with HanLP on the classpath (adjust the paths to your install)
startJVM(getDefaultJVMPath(),
         r"-Djava.class.path=E:\NLP\hanlp\hanlp-1.5.0.jar;E:\NLP\hanlp",
         "-Xms1g",
         "-Xmx1g")
NLPTokenizer = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')
# Tokenizer and HanLP were referenced below but never defined in the original
# post; these bindings are one plausible reading of the author's setup.
Tokenizer = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')
HanLP = JClass('com.hankcs.hanlp.HanLP')

stanford_nlp = StanfordCoreNLP(r'E:\NLP\stanford-corenlp-full-2018-10-05',
                               lang='zh', quiet=False, logging_level=logging.DEBUG)
# stanford_nlp = StanfordCoreNLP(r'E:\NLP\stanford-corenlp-full-2018-10-05', lang='zh')

# POS tags to filter out (particles, punctuation-like tags, etc.)
drop_pos_set = set(['xu','xx','y','yg','wh','wky','wkz','wp','ws','wyy','wyz','wb','u','ud','ude1','ude2','ude3','udeng','udh'])
# Characters outside digits, Latin letters, and common CJK ranges
han_pattern = re.compile(r'[^\dA-Za-z\u3007\u4E00-\u9FCB\uE815-\uE864]+')

def to_string(sentence, return_generator=False):
    # Segment with HanLP's StandardTokenizer and return (word, pos) pairs
    if return_generator:
        return (word_pos_item.toString().split('/') for word_pos_item in Tokenizer.segment(sentence))
    else:
        return [(word_pos_item.toString().split('/')[0], word_pos_item.toString().split('/')[1])
                for word_pos_item in Tokenizer.segment(sentence)]

def to_string_hanlp(sentence, return_generator=False):
    # Same as to_string, but segments with HanLP.segment
    if return_generator:
        return (word_pos_item.toString().split('/') for word_pos_item in HanLP.segment(sentence))
    else:
        return [(word_pos_item.toString().split('/')[0], word_pos_item.toString().split('/')[1])
                for word_pos_item in HanLP.segment(sentence)]

def seg_sentences(sentence, with_filter=True, return_generator=False):
    # Segment a sentence into words, optionally dropping words whose POS is in drop_pos_set
    segs = to_string(sentence, return_generator=return_generator)
    if with_filter:
        g = [word_pos_pair[0] for word_pos_pair in segs
             if len(word_pos_pair) == 2 and word_pos_pair[0] != ' ' and word_pos_pair[1] not in drop_pos_set]
    else:
        g = [word_pos_pair[0] for word_pos_pair in segs
             if len(word_pos_pair) == 2 and word_pos_pair[0] != ' ']
    return iter(g) if return_generator else g

def ner_stanford(raw_sentence, return_list=True):
    # Run CoreNLP NER; returns a list of (token, entity_tag) pairs
    if len(raw_sentence.strip()) > 0:
        return stanford_nlp.ner(raw_sentence) if return_list else iter(stanford_nlp.ner(raw_sentence))

def ner_hanlp(raw_sentence, return_list=True):
    # Segment with HanLP's tokenizer (tokens carry their POS tags)
    if len(raw_sentence.strip()) > 0:
        return NLPTokenizer.segment(raw_sentence) if return_list else iter(NLPTokenizer.segment(raw_sentence))

def cut_stanford(raw_sentence, return_list=True):
    # Run CoreNLP POS tagging; returns a list of (token, pos_tag) pairs
    if len(raw_sentence.strip()) > 0:
        return stanford_nlp.pos_tag(raw_sentence) if return_list else iter(stanford_nlp.pos_tag(raw_sentence))
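Once the paths above point at a real CoreNLP unpack and HanLP jar, a quick sanity check of the wrappers might look like this (the printed tags are illustrative; exact output depends on the models in use). Note that importing tools starts the JVM and the CoreNLP backend as side effects.

from tools import ner_stanford, cut_stanford

print(ner_stanford('克林顿访问北京'))
# e.g. [('克林顿', 'PERSON'), ('访问', 'O'), ('北京', 'CITY')]
print(cut_stanford('克林顿访问北京'))
# e.g. [('克林顿', 'NR'), ('访问', 'VV'), ('北京', 'NR')]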
