# Baidu Tieba image crawler (爬虫-百度贴吧)

# coding=utf-8
import urllib
import urllib2
from lxml import etree


def loadPage(url):
    """Fetch one tieba listing page and crawl every thread linked on it.

    url: full listing-page URL (already carries the kw and pn parameters).
    """
    # Send a browser-like User-Agent, consistent with loadImg/writeImg:
    # tieba serves a different (often stripped) page to the default
    # urllib2 agent, which makes the xpath below match nothing.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/;"
    }
    request = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(request).read()
    # Parse the raw HTML into a DOM tree.
    con = etree.HTML(html)
    # Relative hrefs (<a href="/p/...">) of every thread on the page.
    link_list = con.xpath(r'//div[@class ="t_con cleafix"]/div[2]/div/div/a/@href')
    for link in link_list:
        # hrefs are site-relative, so prefix the site root to get the
        # full thread URL before handing it to the image extractor.
        full_link = "http://tieba.baidu.com" + link
        loadImg(full_link)


# 取出每个帖子里面的图片链接
def loadImg(link):
    """Download every poster-uploaded image found in one thread page.

    link: absolute URL of a single tieba thread.
    """
    # Pretend to be a regular browser so tieba serves the full page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/;"
    }
    req = urllib2.Request(link, headers=headers)
    # Raw HTML of the thread page.
    page = urllib2.urlopen(req).read()
    dom = etree.HTML(page)
    # Images posted inside the thread carry the BDE_Image CSS class;
    # hand each image URL to the writer.
    for img_src in dom.xpath('//img[@class="BDE_Image"]/@src'):
        writeImg(img_src)


def writeImg(link):
    """Fetch one image URL and save it locally.

    The file is named after the last 10 characters of the URL.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/;"
    }
    req = urllib2.Request(link, headers=headers)
    # Raw image bytes from the server.
    data = urllib2.urlopen(req).read()
    # Use the URL tail as the local file name (Python 2 str: decode
    # to unicode before passing it to open()).
    fname = link[-10:]
    with open(fname.decode('utf-8'), 'wb') as out:
        out.write(data)
    print("已经成功下载" + fname)


def tiebaSpider(url, beginPage, endPage):
    """Crawl listing pages beginPage..endPage (inclusive) of one tieba.

    Each listing page shows 50 threads, so page N starts at pn=(N-1)*50.
    """
    page = beginPage
    while page <= endPage:
        # Offset of the first thread on this page.
        offset = (page - 1) * 50
        loadPage(url + "&pn=" + str(offset))
        page += 1
    print("谢谢使用!")


if __name__ == "__main__":
    # Ask the user which tieba to crawl and over which page range.
    kw = raw_input("请输入需要爬取的贴吧名字:")
    beginPage = int(raw_input("请输入起始页:"))
    endPage = int(raw_input("请输入结束页:"))

    # Build the listing URL: https://tieba.baidu.com/f?kw=<urlencoded name>
    base = "https://tieba.baidu.com/f?"
    query = urllib.urlencode({"kw": kw})
    tiebaSpider(base + query, beginPage, endPage)

# Adapted from: blog.csdn.net/mr_muli/article/details/80035252