Completing entity attributes in a Neo4j knowledge graph with a Python crawler

This article is original; please credit the source if reposting.

The general idea: pull entity A and its ID out of the Neo4j database, then pass A to a crawler that searches Baidu Baike and collects the result data. The collected data is returned and written back into Neo4j as attribute values of entity A; matching entity A by its ID during the write greatly speeds up the MATCH.

import requests
import urllib.parse
from bs4 import BeautifulSoup
from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ALL_CERTIFICATES

Connect to the database with the neo4j.v1 driver:

driver = GraphDatabase.driver("bolt://111111111:7687", auth=basic_auth("neo4j", "neo4j"))
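
Before running the crawler it is worth confirming that the Bolt connection actually works. Below is a minimal sketch for that check, assuming the placeholder address and the neo4j/neo4j credentials above have been replaced with real values; the count query is only for illustration:

with driver.session() as test_session:
    # Hypothetical connectivity check: count the nodes currently in the graph.
    total = test_session.run("MATCH (n) RETURN count(n) AS total").single()["total"]
    print("nodes in the graph:", total)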

class Get_datas():
    def __init__(self):
        self.session = driver.session()  # open a session on the driver
        # Request headers for Baidu Baike, used by the crawler below.
        self.header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'baike.baidu.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_neo4j_orgname(self):
        '''Pull the orgnames entities and their IDs out of Neo4j, pass each name to the
        get_spiders_data crawler to search Baidu Baike, then write the returned data back
        into Neo4j as attributes of that entity. Matching the entity by its ID during the
        write greatly speeds up the MATCH.'''
        dis = 0
        coms = self.session.run("MATCH (a:orgnames) RETURN a.name, id(a)").values()
        set_list = []
        for c in coms[dis:]:
            dis += 1
            print("count:", dis)
            orgids = c[1]
            orgname = c[0]
            datad = self.get_spiders_data(orgname)
            if datad:
                # Clean up the scraped data and write it back.
                huzs = str(datad).replace('\\n', '').replace('xa0', '').replace('[3]\\', '')
                huz = huzs.split(',')
                huzd = huz[0].replace("['", "").replace("']", "").replace(" ", "")
                btion = str(huz[1:]).replace("[", "").replace("]", "").replace("}\\''", "")
                query = "match(e) where id(e)=%s set e.descri = '%s', e.btion = '%s' " % (orgids, huzd, btion)  # write into Neo4j (see the note on parameters below)
                self.session.run(query)
                # print('Attributes for this organization filled in successfully')
        self.session.close()
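
A note on the write above: formatting huzd and btion straight into the Cypher string breaks as soon as the scraped text contains a single quote. The driver also accepts query parameters, which avoids the escaping problem entirely. A minimal sketch of the same write using parameters (an alternative, not the original code):

query = "MATCH (e) WHERE id(e) = $orgid SET e.descri = $descri, e.btion = $btion"
self.session.run(query, orgid=orgids, descri=huzd, btion=btion)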

    def get_spiders_data(self, orgname):
        # URL-encode the Chinese name and build the Baidu Baike request URL.
        new = urllib.parse.quote(orgname)
        urls = 'https://baike.baidu.com/item/' + new  # build the URL
        response = requests.get(urls, headers=self.header)  # request the page
        soups = BeautifulSoup(response.text, 'lxml')  # parse the HTML with BeautifulSoup
        try:
            data1 = soups.find_all("div", class_="lemma-summary")  # tags that hold the summary text
            shuju1 = [d.text for d in data1]  # strip the tags, keep the text
            datad = soups.find_all("div", class_="basic-info cmn-clearfix")
            dataleft = soups.find('dl', class_="basicInfo-block basicInfo-left")
            ltdata = str(dataleft).replace('<dl class="basicInfo-block basicInfo-left">\n', '{')  # data cleanup
            dataright = soups.find('dl', class_="basicInfo-block basicInfo-right")
            rtdata = str(dataright).replace('<dl class="basicInfo-block basicInfo-right">\n', '{')  # data cleanup (see the dt/dd sketch below)
            if shuju1:
                return shuju1, ltdata, rtdata
        except Exception as e:
            print(e)
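
The .replace() chains above treat the infobox as one long string. Since the basicInfo-block elements consist of dt/dd pairs (field name / field value), they can also be read into a dict directly, which is easier to write into Neo4j later. A minimal sketch, assuming the Baike infobox still uses the dt/dd structure; parse_basic_info is a hypothetical helper, not part of the original script:

def parse_basic_info(soups):
    # Collect the dt/dd pairs of the Baike infobox into a dict of {field name: field value}.
    info = {}
    for block in soups.find_all('dl', class_='basicInfo-block'):
        names = [dt.get_text(strip=True) for dt in block.find_all('dt')]
        values = [dd.get_text(' ', strip=True) for dd in block.find_all('dd')]
        info.update(zip(names, values))
    return info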

    def mains(self):
        self.get_neo4j_orgname()

if __name__ == '__main__':
    get = Get_datas()
    get.mains()


Reposted from blog.csdn.net/for_yayun/article/details/103295160