Scraping data: patent titles and abstracts

#!/usr/bin/python
# -*- coding: UTF-8 -*-
#########################################################################
# File Name: getsoopt.py
# Author: Ev
# mail: [email protected]
# Created Time: Mon 24 Dec 2018 10:35:12 AM CST
#########################################################################
import sys
import requests
from bs4 import BeautifulSoup
def get_html(url):
    # Mimic a browser visit so the site doesn't reject the request
    headers = {
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/52.0.2743.116 Safari/537.36')
    }
    response = requests.get(url, headers=headers)   # request the page
    #with open('./1.html', 'w+', encoding='utf-8') as f:
    #    f.write(response.text)
    html = response.text   # page source
    return html            # return the page source

index = 27   # results-page number; each page holds 10 patents
soup = BeautifulSoup(get_html("http://www.soopat.com/....PatentIndex=" + str(index * 10)), "lxml")
#soup = BeautifulSoup(open("./1.html"), "lxml")
if "请输入验证码" in soup.title.string:   # the site is asking for a CAPTCHA
    print(soup.title.string)
    sys.exit()
print("get result ok!\n")
title = []
p = soup.find_all(class_="PatentTypeBlock")
for m in p:
    titleTemp = m.find("a").get_text()   # the patent title is the block's first link
    title.append(titleTemp)

content = []
p = soup.find_all(class_="PatentContentBlock")
for m in p:
    contentTemp = m.get_text()   # the abstract is the block's full text
    content.append(contentTemp)
# Append this page's results to get.txt as "N:title\nabstract" records
with open("get.txt", "a+", encoding="utf-8") as f:
    for i in range(len(content)):
        f.write(str(index * 10 + i) + ":")
        f.write(title[i])
        f.write("\n")
        f.write(content[i])
        f.write("\n\n")

The page being scraped is a patent keyword-search results page.

I'm running Python + BeautifulSoup + requests on Ubuntu; a quick web search will turn up environment-setup instructions.
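For reference, the dependencies are the requests, beautifulsoup4, and lxml packages on PyPI, so setup usually amounts to: pip install requests beautifulsoup4 lxml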

index is the results-page number: 0 is the first page, 1 the second, and so on (the site's PatentIndex parameter is index*10, since each page lists 10 results).
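As a tiny illustration of that index*10 arithmetic, the first three pages map to these URL offsets:

for index in (0, 1, 2):
    print("page", index, "-> PatentIndex=" + str(index * 10))   # 0, 10, 20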

The goal of this script is to grab patent titles and short abstracts, to make them easy to consult and design around ^_^

The script's drawback is that it only processes one page per run, and after a few runs the site demands a CAPTCHA; I don't yet know how to get around that.
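One partial workaround is to fetch several pages in a single run and pause between requests. The sketch below is hypothetical: it reuses get_html from the script, the crawl_pages helper is my own name, and it assumes (unverified) that slowing down postpones the CAPTCHA; it does not solve it, it just stops cleanly when the CAPTCHA page appears.

import time

def crawl_pages(first, last, delay=10):
    # Hypothetical multi-page driver: fetch pages first..last, pausing
    # `delay` seconds between requests, and stop at the CAPTCHA page.
    for index in range(first, last + 1):
        soup = BeautifulSoup(
            get_html("http://www.soopat.com/....PatentIndex=" + str(index * 10)),
            "lxml")
        if "请输入验证码" in soup.title.string:   # CAPTCHA reached: give up
            print("CAPTCHA at page", index)
            break
        # ... extract PatentTypeBlock / PatentContentBlock and append to get.txt,
        # exactly as in the single-page script above ...
        time.sleep(delay)   # be gentle; rapid requests trigger the CAPTCHA sooner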


Reposted from www.cnblogs.com/shushanxiaoyao/p/10172174.html