#encoding=utf8
import re,os,json
from stanfordParse import pos
from stanfordParse import parse_sentence
from recursionSearch import search
def split_long_sentence_by_pos(text):
    """Shorten *text* by POS-tagging it and dropping function words.

    Words whose Stanford POS tag is in ``del_flag`` (particles, aspect
    markers, interjections, etc.) are removed; the remaining words are
    concatenated and returned as one string.

    Fixes over the original: ``pos()`` returns ``False`` for very short
    input, which the original then tried to iterate (TypeError); the tag
    list contained duplicates ('SP', 'AD') and was a list (O(n) lookup);
    string ``+=`` in a loop replaced with ``''.join``.
    """
    # Tags to discard; a set gives O(1) membership tests.
    del_flag = {'DEC', 'AD', 'DEG', 'DER', 'DEV', 'SP', 'AS', 'ETC', 'MSP',
                'IJ', 'ON', 'JJ', 'FW', 'LB', 'SB', 'BA', 'PN', 'RB'}
    pos_tag = pos(text)
    if not pos_tag:
        # pos() yields False for text of <= 6 chars: nothing to keep.
        return ''
    return ''.join(word for word, tag in pos_tag if tag not in del_flag)
def extract_parallel(text):
    """Extract 'parallel' fragments: runs of 2-4 CJK characters delimited by
    commas (ASCII or full-width), e.g. the items of an enumeration.

    Returns ``(fragments, remaining_text)``. ``fragments`` is ``None`` when
    no parallel structure is found, in which case the text is returned
    untouched; otherwise the extracted phrases (without commas) are listed
    and the returned text has them removed.

    Fixes over the original:
    - ``str.replace`` results were discarded (strings are immutable), so the
      comma-to-'、' rewrites and text updates were all no-ops;
    - the while-loop reused the stale ``start_start`` index from the first
      match instead of the current match's span;
    - fragments were sliced with ``end-2``, chopping the phrase's last char;
    - the unmodified input was returned instead of the stripped text.
    """
    pattern = re.compile('[,,][\u4e00-\u9fa5]{2,4}[,,]')
    if not pattern.search(text):
        return None, text
    parallel_text = []
    text_leave = text
    match = pattern.search(text_leave)
    while match:
        start, end = match.span()
        # Keep the leading comma so the replace below only removes the phrase
        # in its delimited position; drop the trailing comma so it survives
        # as the delimiter for the next match.
        fragment = text_leave[start:end - 1]
        parallel_text.append(fragment[1:])  # bare phrase, no comma
        text_leave = text_leave.replace(fragment, '')
        match = pattern.search(text_leave)
    return parallel_text, text_leave
def split_long_sentence_by_sep(text):
    """Split *text* into short segments.

    The text is first split on sentence-ending punctuation. Each segment
    longer than 19 chars is shortened by POS-based function-word dropping;
    if still too long it is split on commas.

    Returns a list whose items are strings, or lists of strings for
    comma-split segments.

    Fixes over the original:
    - ``if '。' or '.' ...`` was always true (it tested string truthiness,
      not membership); ``re.split`` already returns ``[text]`` when no
      separator is present, so no guard is needed;
    - ``seg = seg.remove('')`` set ``seg`` to ``None`` (``list.remove``
      returns ``None``), so ``None`` entries were appended.
    """
    segments = []
    for seg in re.split(r'[。.!!??;;]', text):
        if seg in ('', ' '):
            continue
        # Strip parallel enumerations first (extracted phrases are discarded
        # here, matching the original behavior).
        _para, seg = extract_parallel(seg)
        if len(seg) > 19:
            seg = split_long_sentence_by_pos(seg)  # drop function words
        if len(seg) > 19:
            # Still too long: fall back to comma splitting, dropping empties.
            seg = [piece for piece in re.split('[,,]', seg)
                   if piece not in ('', ' ')]
        segments.append(seg)
    return segments
def read_data(path):
    """Open the UTF-8 text file at *path* for reading.

    Returns the open file handle (iterable over lines); the caller is
    responsible for closing it.
    """
    handle = open(path, "r", encoding="utf8")
    return handle
def get_np_words(t):
    """Collect multi-token NP chunks from the depth-3 subtrees of *t*.

    Returns a list of strings, each the concatenation of the leaves of an
    'NP'-labeled subtree of height 3 that has more than one leaf.
    """
    return [
        ''.join(sub.leaves())
        for sub in t.subtrees(lambda s: s.height() == 3)
        if sub.label() == 'NP' and len(sub.leaves()) > 1
    ]
def get_n_v_pair(t):
    """Intended to extract noun/verb pairs from parse tree *t*.

    NOTE(review): unfinished — the NP text is computed but never stored or
    returned, so the function currently always returns None and has no
    side effects. Complete or remove.
    """
    for tree in t.subtrees(lambda t: t.height() == 3):
        if tree.label() == 'NP' and len(tree.leaves()) > 1:
            noun_phrase = ''.join(tree.leaves())
if __name__ == "__main__":
    # NOTE(review): "dependency.txt" is opened for writing but nothing is
    # ever written to it — presumably the results were meant to be saved.
    # Kept write-open for compatibility, but now properly closed.
    # Fix: both file handles were leaked in the original (never closed).
    with open("dependency.txt", 'w', encoding='utf8') as out, \
            read_data('text.txt') as itera:
        for it in itera:
            # Constituency-parse the line via Stanford CoreNLP, wrapped as
            # an nltk Tree.
            s = parse_sentence(it)
            # Walk the tree and merge phrases into (verb, noun-chunk) pairs.
            res = search(s)
            print(res)
# ==== recursionSearch.py ====
#encoding=utf8
import nltk.tree as tree
import nltk
def get_vn_pair():
    """Placeholder for verb/noun pair extraction.

    TODO(review): implement or remove — currently does nothing.
    """
    pass
def get_noun_chunk(tree):
    """Return the noun chunk of *tree* as a one-element list.

    If *tree* is labeled 'NP', its leaves are joined into a single string
    and returned as ``[string]``; otherwise an empty list is returned.
    """
    chunks = []
    if tree.label() == "NP":
        chunks.append(''.join(tree.leaves()))
    return chunks
def get_ip_recursion_noun(tree):
    """Recursively collect NP chunks from *tree*.

    Descends into nodes with 1-3 children (as the original did) and gathers
    ``get_noun_chunk`` results for every 'NP'-labeled node encountered.
    Returns a list of noun-chunk lists.

    Fixes over the original:
    - the recursive calls' return values were discarded, so nested NPs were
      never collected;
    - leaf nodes (plain strings) were recursed into and then crashed on
      ``.label()``; they are now skipped.
    """
    np_list = []
    if not hasattr(tree, 'label'):
        # Leaf token (plain string): nothing to collect.
        return np_list
    if 1 <= len(tree) <= 3:
        # Original only recursed into nodes with 1-3 children; preserved.
        for child in tree:
            np_list.extend(get_ip_recursion_noun(child))
    if tree.label() == 'NP':
        np_list.append(get_noun_chunk(tree))
    return np_list
def get_vv_loss_np(tree):
    """Find NP chunks below *tree* without descending into nested VPs.

    Iterative DFS over the tree's descendants; every 'NP' node contributes
    its ``get_noun_chunk`` result, and 'VP' subtrees are skipped entirely.
    Returns a list of noun-chunk lists, or False when the input is not an
    nltk Tree or no NP was found.

    Fixes over the original:
    - the root node itself was tested first, so calling this on a VP (as
      ``search()`` does) always hit the VP-skip branch immediately and
      returned False; traversal now starts at the root's children;
    - the NP branch collected chunks from the root ``tree`` instead of the
      NP node that was actually found (``current_tree``).
    """
    if not isinstance(tree, nltk.tree.Tree):
        return False
    np = []
    stack = list(tree)  # start below the root so a VP root is not skipped
    while stack:
        node = stack.pop()
        if not isinstance(node, nltk.tree.Tree):
            continue  # leaf token
        if node.label() == 'NP':
            np.append(get_noun_chunk(node))
        elif node.label() != 'VP':
            # Non-NP, non-VP interior node: keep descending.
            stack.extend(node)
    return np if np else False
def search(tree_in):
    """Traverse a constituency tree and collect (verb, noun-chunk) pairs.

    Iterative DFS: descends through ROOT and IP (simple clause) nodes; at
    each VP whose first child is a VV (verb), pairs the verb's text with the
    noun chunk of each sibling NP, falling back to ``get_vv_loss_np`` when
    the sibling is not an NP.

    Returns a list of ``(verb, noun_chunk_list)`` tuples, or False when the
    input is not an nltk Tree or no pair was found.
    """
    if not isinstance(tree_in, nltk.tree.Tree):
        return False
    vp_pair = []
    stack = []
    stack.append(tree_in)
    current_tree = ''  # NOTE(review): assigned but never used afterwards
    while stack:
        tree = stack.pop()
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "ROOT":  # sentence root: descend
            for i in range(len(tree)):
                stack.append(tree[i])
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "IP":  # simple clause: descend
            for i in range(len(tree)):
                stack.append(tree[i])
        if isinstance(tree, nltk.tree.Tree) and tree.label() == "VP":  # verb phrase
            duplicate = []  # noun chunks already paired under this VP
            if len(tree) >= 2:
                for i in range(1, len(tree)):
                    if tree[0].label() == 'VV' and tree[i].label() == "NP":  # verb + noun phrase
                        verb = ''.join(tree[0].leaves())  # leaves are tokens; join into the verb string
                        noun = get_noun_chunk(tree[i])
                        if verb and noun:
                            vp_pair.append((verb, noun))  # collect the verb / noun-chunk pair
                            duplicate.append(noun)
                    elif tree[0].label() == 'VV' and tree[i].label() != "NP":
                        # NP not directly under this VP: search its subtree.
                        noun = get_vv_loss_np(tree)
                        verb = ''.join(tree[0].leaves())
                        if verb and noun and noun not in duplicate:
                            duplicate.append(noun)
                            vp_pair.append((verb, noun))
    if vp_pair:
        return vp_pair
    else:
        return False
#if tree.label()=="NP":
#nouns_phase=''.join(tree.leaves())
#noun_chunk.append(nouns_phase)
# ==== stanfordParse.py ====
#encoding=utf8
from stanfordcorenlp import StanfordCoreNLP
from nltk import Tree, ProbabilisticTree
# Stanford CoreNLP server handle for Chinese. Heavyweight: started once at
# import time, shared by all functions in this module.
nlp = StanfordCoreNLP('/home/lhq/桌面/NLP_basis/stanfordnlp', lang='zh')
import nltk, re
# NOTE(review): "grammer" is a typo for "grammar", but the name is kept —
# it may be referenced elsewhere.
grammer = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammer)  # chunk parser built from the NP rule
# Matches any char that is neither an ASCII letter nor a CJK ideograph.
pattern = re.compile(u'[^a-zA-Z\u4E00-\u9FA5]')
# Fixed: the original '(\a-zA-Z0-9+)' contained a bell escape (\a) and no
# character class, so it could never match alphanumeric runs as intended.
pattern_del = re.compile('([a-zA-Z0-9]+)')
def _replace_c(text):
"""
将英文标点符号替换成中文标点符号,并去除html语言的一些标志等噪音
:param text:
:return:
"""
intab = ",?!()"
outtab = ",?!()"
deltab = " \n<li>< li>+_-.><li \U0010fc01 _"
trantab=text.maketrans(intab, outtab,deltab)
return text.translate(trantab)
def parse_sentence(text):
    """Constituency-parse *text* with Stanford CoreNLP.

    The text is denoised with ``_replace_c`` first. Sentences of 6 chars or
    fewer (after cleaning and stripping) are not considered real sentences
    and yield None.

    Returns an nltk Tree, or None for short input or on parser failure.

    Fix over the original: the bare ``except: pass`` (which also swallowed
    KeyboardInterrupt/SystemExit) is narrowed to ``except Exception`` while
    keeping the best-effort return-None behavior.
    """
    text = _replace_c(text)  # denoise before parsing
    stripped = text.strip()
    if len(stripped) <= 6:
        return None
    try:
        # nlp.parse returns a bracketed parse string; wrap as an nltk Tree.
        return Tree.fromstring(nlp.parse(stripped))
    except Exception:
        return None
def pos(text):
    """POS-tag *text* with Stanford CoreNLP after denoising it.

    Returns the tagger's (word, tag) pairs, or False when the cleaned text
    is 6 characters or fewer (treated as not a sentence).
    """
    cleaned = _replace_c(text)
    if len(cleaned.strip()) <= 6:
        return False
    return nlp.pos_tag(cleaned)
def denpency_parse(text):
    """Dependency-parse *text* with Stanford CoreNLP.

    NOTE(review): the name is a typo for "dependency_parse", kept unchanged
    because external callers may rely on it.
    """
    return nlp.dependency_parse(text)
from nltk.chunk.regexp import *