python 多线程爬取百度图片

最近在学习 Python,磕磕绊绊地写了一个根据关键字、使用多线程爬取百度图片的程序,代码如下:

import os
import urllib.request
import threading
import re

# Load the list of search keywords from a local file.
def getNameList(filename):
    """Read the search keywords from a local text file.

    The encoding is given explicitly because relying on the platform default
    (e.g. GBK) raises UnicodeDecodeError on UTF-8 input. 'utf-8-sig' decodes
    plain UTF-8 identically and additionally drops a leading byte-order mark,
    which would otherwise become part of the first keyword.

    Args:
        filename: Path of a text file holding one keyword per line.

    Returns:
        A list of the non-empty, whitespace-stripped lines of the file,
        in file order.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, encoding='utf-8-sig') as file:
        stripped = (line.replace('\n', '').strip() for line in file)
        # Blank lines would otherwise turn into searches for an empty keyword.
        keywordlist = [keyword for keyword in stripped if keyword]
    print(keywordlist)
    return keywordlist

def getHtml(url):
    """Fetch *url* and return the response body as text.

    The body is decoded as UTF-8 because the regex applied later uses a
    string pattern, which cannot be matched against a bytes object.

    Args:
        url: The address to open with urllib.request.urlopen.

    Returns:
        The full response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Close the response deterministically instead of leaking the connection.
    with urllib.request.urlopen(url) as response:
        outhtml = response.read().decode('utf-8')
    return outhtml
# pn is the paging value substituted into the URL template.
def getUrl(url,word,pn):
    """Build a concrete search URL from a template.

    Args:
        url: Template containing ``{word}`` and ``{pn}`` placeholders.
        word: Search keyword; it is percent-encoded first because urlopen
            cannot handle raw non-ASCII characters such as Chinese.
        pn: Value substituted for the ``{pn}`` placeholder.

    Returns:
        The template with both placeholders filled in.
    """
    encoded_word = urllib.parse.quote(word)
    return url.format(word=encoded_word, pn=pn)

def getUrlsFromHtml(html):
    """Extract every thumbnail link from a result page.

    Args:
        html: Page source as a string.

    Returns:
        A list of the values captured from each ``"thumbURL":"..."`` pair,
        in order of appearance; empty when there is no match.
    """
    # The capture group (.*?) is the image link. The previous version also
    # compiled this pattern with re.S but never used the compiled object, so
    # that flag had no effect; matching is unchanged here.
    reg = r'"thumbURL":"(.*?)"'
    return re.findall(reg, html)

def downLoadImg(urls,path,keyword,downloadnum,picnumPerKeyword):
    """Download images until the per-keyword limit is reached.

    Each image is saved as ``<keyword><index>.jpg`` inside *path*, where the
    index starts at *downloadnum* and advances only after a successful
    download.

    Args:
        urls: Iterable of image URLs to fetch.
        path: Directory the files are written to.
        keyword: Prefix of every saved file name.
        downloadnum: Number of images already saved for this keyword.
        picnumPerKeyword: Total number of images wanted for this keyword.

    Returns:
        The updated count of saved images (never less than *downloadnum*).
    """
    index = downloadnum
    for url in urls:
        # Check before each download so no image beyond the limit is fetched.
        if index >= picnumPerKeyword:
            break
        print("下载:",url)
        filename = os.path.join(path, keyword + str(index) + ".jpg")
        try:
            urllib.request.urlretrieve(url, filename)
        except Exception as e:
            # Best effort: skip this URL and keep the counter where it is.
            # Rewinding it here would make the next success overwrite an
            # earlier image and could push the count below downloadnum.
            print("There is an Exception!", e)
            continue
        index += 1
    return index

def imgDwonloadThread(url,keyword,picnumPerKeyword,savepatn):
    """Thread target: download images for a single keyword.

    Requests result pages one after another, advancing ``pn`` by 30 each
    time, and passes the thumbnail links of every page to downLoadImg until
    *picnumPerKeyword* images are saved or a page yields no links.

    Args:
        url: Search URL template with ``{word}`` and ``{pn}`` placeholders.
        keyword: Search keyword, also used as the file-name prefix.
        picnumPerKeyword: Number of images wanted for this keyword.
        savepatn: Directory the images are written to.
    """
    pn = 0
    downloadnum = 0
    print('下载:', keyword)
    while downloadnum < picnumPerKeyword:
        newurl = getUrl(url, keyword, pn)
        html = getHtml(newurl)
        # Collect the image links found on this page.
        urls = getUrlsFromHtml(html)
        if not urls:
            # No links on this page: without this exit the counter never
            # advances and the loop would keep requesting pages forever.
            break
        downloadnum = downLoadImg(urls, savepatn, keyword,downloadnum,picnumPerKeyword)
        pn += 30

def main():
    """Start one download thread per keyword listed in the keyword file.

    The search URL template, keyword file location, save directory and
    per-keyword image count are fixed in this function.
    """
    url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1514256634296_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word={word}&cg=girl&rn=60&pn={pn}'
    keywordfile = "d://keyword"
    savepatn = "D://pythonDownLoad/"
    keywordlist = getNameList(keywordfile)
    picnumPerKeyword = 100
    # Every download fails if the target directory is missing, so create it.
    os.makedirs(savepatn, exist_ok=True)
    for keyword in keywordlist:
        # Start a new thread for this keyword.
        thread = threading.Thread(target=imgDwonloadThread,args=(url,keyword,picnumPerKeyword,savepatn))
        try:
            thread.start()
        except RuntimeError:
            # Thread.start() raises RuntimeError when the thread cannot be
            # started; a bare except would also hide KeyboardInterrupt.
            print("创建新线程失败!")

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

d://keyword 文件中存放关键字,每行一个;程序会为每个关键字启动一个独立的线程来爬取图片。

猜你喜欢

转载自blog.csdn.net/luvalluo/article/details/78902989