依存句法树解析

#encoding=utf8
import re,os,json
from stanfordParse import pos
from stanfordParse import parse_sentence
from recursionSearch import search
def split_long_sentence_by_pos(text):
    del_flag=['DEC','AD','DEG','DER','DEV','SP','AS','ETC','SP','MSP','IJ','ON','JJ','FW','LB','SB','BA','AD','PN','RB']
    pos_tag=pos(text)
    new_str=''
    for apos in pos_tag:
        if apos[1] not in del_flag:
            new_str+=apos[0]
    return new_str
def extract_parallel(text):
    parallel_text=[]
    pattern=re.compile('[，,][\u4e00-\u9fa5]{2,4}[，,]')
    search_obj=pattern.search(text)
    if search_obj:
        start_start,end=search_obj.span()
        rep=text[start_start:end-2]
        rep1=text[start_start:end-1]
        if '，' in rep1:
            rep1.replace('，','、')
        if ',' in rep1:
            rep1.replace(',','、')  
        text.replace(rep1,text)
        parallel_text.append(rep[1:])
        text_leave=text.replace(rep,'')
        while pattern.search(text_leave):
            start,end=pattern.search(text_leave).span()
            rep=text_leave[start:end-2]
            rep1=text[start_start:end-1]
            if '，' in rep1:
                rep1.replace('，','、')
            if ',' in rep1:
                rep1.replace(',','、')  
            text.replace(rep1,text)            
            text_leave=text_leave.replace(rep,'')
            parallel_text.append(rep[1:])
        
        return parallel_text,text
    else:
        return None,text
            
def split_long_sentence_by_sep(text):
    segment=[]
    if '。' or '.' or '!' or '！' or '?' or '？' or ';' or '；' in text:
        text=re.split(r'[。.!！?？;；]',text)
        for seg in text:
            if seg=='' or seg==' ':
                continue
            para,seg=extract_parallel(seg)
            if len(seg)>19:
                seg=split_long_sentence_by_pos(seg)
                if len(seg)>19:       
                    seg=re.split('[，,]',seg)
                    if isinstance(seg,list) and '' in seg:
                        seg=seg.remove('')
                    if isinstance(seg, list) and ' ' in seg:
                        seg=seg.remove(' ')                      
            segment.append(seg)            
    return segment

def read_data(path):
    
    return open(path,"r",encoding="utf8")           
             
def get_np_words(t):
    noun_phrase_list=[]
    for tree in t.subtrees(lambda t:t.height()==3):
        if tree.label()=='NP' and len(tree.leaves())>1:
            noun_phrase=''.join(tree.leaves())
            noun_phrase_list.append(noun_phrase)
    return noun_phrase_list

def get_n_v_pair(t):
    for tree in t.subtrees(lambda t:t.height()==3):
        if tree.label()=='NP' and len(tree.leaves())>1:
            noun_phrase=''.join(tree.leaves())

    
if __name__=="__main__":
    out=open("dependency.txt",'w',encoding='utf8')
    itera=read_data('text.txt')
    for it in itera:
        s=parse_sentence(it)   # 通过Stanfordnlp依存句法分析得到一个句法树 用nltk包装成树的结构
        res=search(s)          # 使用nltk遍历树，然后把短语合并
        print(res)

recursionSearch.py：

#encoding=utf8
import nltk.tree as tree
import nltk

def get_vn_pair():
    pass
def get_noun_chunk(tree):
    noun_chunk=[]
    if tree.label()=="NP":
        nouns_phase=''.join(tree.leaves())
        noun_chunk.append(nouns_phase)   
    return noun_chunk

def get_ip_recursion_noun(tree):
    np_list=[]
    if len(tree)==1:
        tr=tree[0]
        get_ip_recursion_noun(tr)
    if len(tree)==2:
        tr=tree[0]
        get_ip_recursion_noun(tr)        
        tr=tree[1]
        get_ip_recursion_noun(tr)        
    if len(tree)==3:
        tr=tree[0]
        get_ip_recursion_noun(tr)        
        tr=tree[1]
        get_ip_recursion_noun(tr)       
        tr=tree[2]
        get_ip_recursion_noun(tr)    
    if tree.label()=='NP':
        np_list.append(get_noun_chunk(tree))
    return np_list



def get_vv_loss_np(tree):
    if not isinstance(tree,nltk.tree.Tree):
        return False
    stack=[]
    np=[]
    stack.append(tree)
    current_tree=''
    while stack:
        current_tree=stack.pop()
        if isinstance(current_tree,nltk.tree.Tree) and current_tree.label()=='VP':
            continue        
        elif isinstance(current_tree,nltk.tree.Tree) and current_tree.label()!='NP':
            for i in range(len(current_tree)):                
                stack.append(current_tree[i])
        elif isinstance(current_tree,nltk.tree.Tree) and current_tree.label()=='NP':
            np.append(get_noun_chunk(tree))
    if np:
        return np
    else:
        return False
            
def search(tree_in):                                         # 遍历刚才构建的树
    if not isinstance(tree_in,nltk.tree.Tree):
        return False    
    vp_pair=[]  
    stack=[]
    stack.append(tree_in)
    current_tree=''
    while stack:
        tree=stack.pop()
        if isinstance(tree,nltk.tree.Tree) and tree.label()=="ROOT":    # 要处理的文本的语句
            for i in range(len(tree)):
                stack.append(tree[i])	    
        if isinstance(tree,nltk.tree.Tree) and tree.label()=="IP":      # 简单从句
            for i in range(len(tree)):
                stack.append(tree[i])	          
        if isinstance(tree,nltk.tree.Tree) and tree.label()=="VP":      # 动词短语
            duplicate=[]
            if len(tree)>=2:
                for i in range(1,len(tree)):
                    if tree[0].label()=='VV' and tree[i].label()=="NP":  # 动词 和 名词短语
                        verb=''.join(tree[0].leaves())               # 合并动词 leaves是分词
                        noun=get_noun_chunk(tree[i])
                        if verb and noun:
                            vp_pair.append((verb,noun))                 # 返回 动名词短语对
                            duplicate.append(noun)
                    elif tree[0].label()=='VV' and tree[i].label()!="NP":
                        noun=get_vv_loss_np(tree)
                        verb=''.join(tree[0].leaves())
                        if verb and noun and noun not in duplicate:
                            duplicate.append(noun)
                            vp_pair.append((verb,noun))
    if vp_pair:
        return vp_pair
    else:
        return False                        


    #if tree.label()=="NP":
        #nouns_phase=''.join(tree.leaves())
        #noun_chunk.append(nouns_phase)

stanfordParse.py：

#encoding=utf8
from stanfordcorenlp import StanfordCoreNLP
from nltk import Tree, ProbabilisticTree
nlp = StanfordCoreNLP('/home/lhq/桌面/NLP_basis/stanfordnlp', lang='zh')
import nltk,re


grammer = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammer)                         #生成规则
pattern=re.compile(u'[^a-zA-Z\u4E00-\u9FA5]')
pattern_del=re.compile('(\a-zA-Z0-9+)')


def _replace_c(text):
    """
    将英文标点符号替换成中文标点符号，并去除html语言的一些标志等噪音
    :param text:
    :return:
    """
    intab = ",?!()"
    outtab = "，？！（）"
    deltab = " \n<li>< li>+_-.><li \U0010fc01 _"
    trantab=text.maketrans(intab, outtab,deltab)
    return text.translate(trantab)


def parse_sentence(text):
    text=_replace_c(text)          # 文本去噪
    try:
        if len(text.strip())>6:    # 判断，文本是否大于6个字，小于6个字的我们认为不是句子
            return Tree.fromstring(nlp.parse(text.strip()))        # nlp.parse(text.strip())：是将句子变成依存句法树  Tree.fromstring是将str类型的树转换成nltk的结构的树
    except:
        pass


def pos(text):
    text=_replace_c(text)
    if len(text.strip())>6:
        return nlp.pos_tag(text)
    else:
        return False

def denpency_parse(text):
    return nlp.dependency_parse(text)

from nltk.chunk.regexp import *

猜你喜欢