# 百度贴吧帖子图片爬虫 (Baidu Tieba post image crawler)

import requests
from lxml import html
import requests
import time
# Base URL of the Tieba forum list endpoint; shared with GetHtml via `global url`.
url="https://tieba.baidu.com/f"
# XPath candidates for extracting thread links from a forum list page:
# //div[contains(@class,"threadlist_title pull_left")]/a/@href
# //div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div/a/@href

def GetHtml(keywords, pages):
    """Fetch each forum list page for *keywords* and pass its HTML to getLINK.

    keywords: Tieba forum name (sent as the 'kw' query parameter).
    pages: number of list pages to fetch; Tieba shows 50 threads per page,
        so the 'pn' offset advances in steps of 50.
    """
    global url
    headers = {'User-Agent': 'Mozilla/5.0'}
    for page in range(pages):
        # 'pn' is a thread offset, not a page number: page * 50.
        params = {'kw': keywords, 'pn': page * 50}
        time.sleep(3)  # throttle to avoid hammering the server
        r = requests.get(url, params=params, headers=headers)
        r.encoding = "utf8"
        getLINK(r.text)

def getLINK(File_name):
    """Extract thread links from a forum list page and crawl each for images.

    File_name: raw HTML text of a Tieba forum list page (despite the name,
        this is page content, not a file path). Tieba hides the thread list
        inside HTML comments, so the comment markers are stripped first.
    """
    content = File_name.replace('<!--', '').replace('-->', '')
    tree = html.etree.HTML(content)
    hrefs = tree.xpath('//div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div/a/@href')
    for href in hrefs:
        # hrefs are relative (e.g. /p/123456); make them absolute.
        getIMG('https://tieba.baidu.com' + href)




def getIMG(link):
    """Download every inline post image (class BDE_Image) from one thread page.

    link: absolute URL of a Tieba thread page.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(link, headers=headers)
    # Thread content is also wrapped in HTML comments; strip the markers.
    page = r.text.replace('<!--', '').replace('-->', '')
    tree = html.etree.HTML(page)
    # NOTE: the class attribute really contains trailing spaces ("p_content  ").
    img_links = tree.xpath('//div[@class="p_content  "]//img[@class="BDE_Image"]/@src')

    # Use a distinct loop name so the 'link' parameter is not shadowed.
    for img_url in img_links:
        try:
            print("正在下载:%s" % (img_url))
            download_img(img_url)
        except Exception:
            # Skip a single failed image rather than aborting the whole
            # thread; narrowed from a bare except so Ctrl-C still works.
            print("出错正在跳过")
            continue


def download_img(link):
    """Download one image URL into pics/, named by the URL's final path segment.

    link: direct URL of an image file.
    """
    import os  # local import: only this function needs it
    os.makedirs('pics', exist_ok=True)  # avoid FileNotFoundError on first run
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(link, headers=headers, timeout=30)
    pic_name = 'pics/' + link[link.rfind('/') + 1:]
    with open(pic_name, 'wb') as f:
        f.write(r.content)

if __name__ == '__main__':
    # Ask for crawl parameters interactively, then start crawling.
    keywords = input("请输入搜索关键字:")
    pages = int(input("请输入需要获得的页数:"))
    GetHtml(keywords, pages)
# Adapted from: blog.csdn.net/Captain_DUDU/article/details/102934243