Python 爬取百度贴吧美图和文字

import urllib.request
from lxml import etree
 
# Anti-hotlinking workaround: send the thread page as the Referer so that
# requests appear to come from the site itself.
opener = urllib.request.build_opener()
headers = ('Referer', 'https://tieba.baidu.com/p/4640092720?pn=1')
opener.addheaders = [headers]  # this assignment replaces the opener's header list
# Install globally so module-level helpers (e.g. urlretrieve) use it as well.
urllib.request.install_opener(opener)

# Walk pages 1-13 of the thread. For every page, download each post image
# and append each post text fragment to one shared output file.
for i in range(1, 14):
    url = "https://tieba.baidu.com/p/4640092720?pn=" + str(i)
    # Bytes that are not valid UTF-8 are dropped instead of aborting the crawl.
    response = opener.open(url).read().decode("utf-8", "ignore")
    html = etree.HTML(response)

    # NOTE(review): '///' is non-standard XPath; '//img' is presumably what
    # is meant - confirm before normalizing.
    imagelist = html.xpath('///img[@class="BDE_Image"]/@src')
    print(len(imagelist))
    for j, thisimgurl in enumerate(imagelist):
        # Keep page and image index apart with "_": plain concatenation maps
        # page 1 / image 11 and page 11 / image 1 to the same "111.jpg",
        # so the later download would overwrite the earlier one.
        file = "D://python//baidu_bl/" + str(i) + "_" + str(j) + ".jpg"
        urllib.request.urlretrieve(thisimgurl, filename=file)

    textlist = html.xpath('//div[@class="d_post_content j_d_post_content "]/text()')
    print(len(textlist))
    if textlist:
        # Open the output once per page and let `with` close it, instead of
        # opening a new handle for every fragment. An explicit encoding is
        # used because the platform default codec may be unable to encode
        # some characters found in the posts.
        with open("D:/python/baidu_bl/baidu_bl.doc", "a", encoding="utf-8") as fh1:
            for data in textlist:
                fh1.write(data + '\n')

多线程版本如下：

import urllib.request
import threading
from lxml import etree


# Anti-hotlinking workaround: every request carries the thread page as its
# Referer, changing the apparent source of the visit.
opener = urllib.request.build_opener()
headers = ('Referer', 'https://tieba.baidu.com/p/4640092720?pn=1')
opener.addheaders = [headers]  # this assignment replaces the opener's header list
# Install globally so urllib.request helpers outside `opener.open` use it too.
urllib.request.install_opener(opener)


class Img(threading.Thread):
    """Worker thread that downloads the post images of pages 1 and 2."""

    def __init__(self):
        super().__init__()

    def run(self):
        """Fetch each page, collect the BDE_Image sources and save them as .jpg files."""
        for page in (1, 2):
            page_url = "https://tieba.baidu.com/p/4640092720?pn=" + str(page)
            raw = opener.open(page_url).read()
            # Undecodable bytes are ignored rather than raising.
            tree = etree.HTML(raw.decode("utf-8", "ignore"))
            sources = tree.xpath('///img[@class="BDE_Image"]/@src')
            print(len(sources))
            for index, src in enumerate(sources):
                # File name is the page number followed by the image index.
                target = "D://python//baidu_bl/" + str(page) + str(index) + ".jpg"
                urllib.request.urlretrieve(src, filename=target)
class Text(threading.Thread):
    """Worker thread that appends the post text of pages 1 and 2 to a file."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Fetch each page and append every post text fragment, one per line.

        Output goes to ``D:/python/baidu_bl/baidu_bl.doc`` in append mode.
        """
        for i in range(1, 3):
            url = "https://tieba.baidu.com/p/4640092720?pn=" + str(i)
            # Bytes that are not valid UTF-8 are dropped instead of raising.
            response = opener.open(url).read().decode("utf-8", "ignore")
            html = etree.HTML(response)
            textlist = html.xpath('//div[@class="d_post_content j_d_post_content "]/text()')
            if not textlist:
                # No text on this page: nothing to write, and no file handle
                # exists to close.
                continue
            # One handle per page, closed by `with` even if a write fails.
            # The encoding is explicit because the platform default codec
            # may be unable to encode some characters found in the posts.
            with open("D:/python/baidu_bl/baidu_bl.doc", "a", encoding="utf-8") as fh1:
                for data in textlist:
                    fh1.write(data + '\n')

# Create both worker threads, then start them: images first, text second.
t1 = Img()
t2 = Text()
for worker in (t1, t2):
    worker.start()

Python 爬取百度贴吧美图和文字

猜你喜欢