Python crawler, part 3: downloading web images

Goal: Douban Reading (book.douban.com).

Download the book images shown on the page.

import urllib.request
import re #Use regular expressions


def getJpg(date):
    """Find the JPG image tags (with their alt text) in a page.

    date: the HTML source of the page as a string.

    Returns the list produced by ``findall``: one 3-tuple per match, holding
    the ``img src="...jpg"`` part, the text skipped between the two parts,
    and the trailing part that ends with the ``alt="..."`` attribute.
    """
    imgWithAlt = re.compile(
        r'(img src="http.+?.jpg")([\s\S]*?)(.+?.alt=".+?.")')
    return imgWithAlt.findall(date)

def downLoad(jpgUrl,sTitle,n):
    """Download one image and store it as ``<sTitle>.jpg``.

    jpgUrl: address of the image to fetch.
    sTitle: text used as the file name (without the extension).
    n: running number of the picture, used only in the progress message.

    The download is best-effort: any error is printed and swallowed so the
    caller can carry on with the next image. The "completed" line is printed
    whether or not the download succeeded.
    """
    try:
        urllib.request.urlretrieve(
            jpgUrl,
            'C:\\Users\\74172\\source\\repos\\Python\\spidertest1\\images\\book.douban\\%s.jpg' % sTitle)
    except Exception as e:
        # Print the exception that was actually caught.
        print(e)
    finally:
        print('Picture %s download operation completed' % n)

def getTitle(date):
    """Return every ``title="?">`` occurrence found in the page source.

    date: the HTML source of the page as a string.

    NOTE(review): the single ``.`` in the pattern matches exactly one
    character, so only one-character titles directly followed by ``">`` are
    returned -- confirm whether that is intended.
    """
    titlePattern = re.compile(r'title=".">')
    return titlePattern.findall(date)


if __name__ == '__main__':
    # Fetch the Douban books front page and download every JPG image that
    # getJpg() finds, naming each file after the image's alt text.
    url = 'https://book.douban.com/'
    # The context manager closes the HTTP response once the page is read.
    with urllib.request.urlopen(url) as res:
        date = res.read().decode('utf-8')
    date_jpg = getJpg(date)
    imageTitle = getTitle(date)
    # n is a plain module-level counter here, so no global declaration is
    # needed.
    for n, jpginfo in enumerate(date_jpg, start=1):
        # Each match is a tuple of text pieces; join them back into the
        # matched HTML and search that, rather than slicing the tuple's repr.
        matched = ''.join(jpginfo)
        s = re.findall(r'http.+?.jpg', matched)
        titleMatch = re.search(r'alt="(.+?.)"', matched)
        if not s or titleMatch is None:
            # Nothing usable in this match; skip it instead of crashing.
            continue
        print(n,'--- url -->',s[0])
        sTitle = titleMatch.group(1)
        downLoad(s[0],sTitle,n)
        

I then made some modifications so that each title is also written to a txt file:

import urllib.request
import re #Use regular expressions


def getJpg(html):
    """Return the JPG URLs of the image tags that also carry an alt text.

    html: the HTML source of the page as a string.

    The page is first searched for ``img src="...jpg"`` tags followed by an
    ``alt="..."`` attribute; the URLs are then pulled out of the textual form
    of that match list.
    """
    tagPattern = re.compile(
        r'(img src="http.+?.jpg")([\s\S]*?)(.+?.alt=".+?.")')
    matchesAsText = str(tagPattern.findall(html))
    return re.compile(r'http.+?.jpg').findall(matchesAsText)

def downLoad(jpgUrl,sTitle,n):
    """Download one image and store it as ``<sTitle>.jpg``.

    jpgUrl: address of the image to fetch.
    sTitle: text used as the file name and in the progress message.
    n: accepted for the caller's convenience; not used here.

    Errors are not caught: the progress line is printed and the exception
    then propagates to the caller.
    """
    try:
        target = 'C:/Users/74172/source/repos/Python/spidertest1/images/book.douban/%s.jpg' % sTitle
        urllib.request.urlretrieve(jpgUrl, target)
    finally:
        # Printed on success and on failure alike.
        print('Picture---%s----Download operation completed' % sTitle)

def getTitle(html):
    """Return the quoted alt texts of the JPG image tags in a page.

    html: the HTML source of the page as a string.

    Each returned item still includes its surrounding double quotes, e.g.
    ``'"Some title"'``.
    """
    # Three narrowing passes: whole image tags, then their alt attributes,
    # then just the quoted values. Each later pass runs on the textual form
    # of the previous pass's result list.
    passes = (
        r'(img src="http.+?.jpg")([\s\S]*?)(.+?.alt=".+?.")',
        r'alt=".+?."',
        r'".+?."',
    )
    text = html
    for pattern in passes:
        found = re.findall(pattern, text)
        text = str(found)
    return found

def writeTxt(imageTitle, fileName=None):
    """Append one title as a line to a UTF-8 text file.

    imageTitle: the text to write; a newline is added after it.
    fileName: path of the file to append to. When omitted, the name is
        derived from the module-level ``url`` by dropping its first 8 and
        last 5 characters and adding ``.txt`` (``book.douban.txt`` for
        ``https://book.douban.com/``).

    Raises OSError if the file cannot be opened or written.
    """
    if fileName is None:
        fileName = url[8:-5] + '.txt'
    # ``with`` closes the file on every path and, unlike a manual
    # try/finally, cannot trip over an unassigned handle when open() fails.
    with open(fileName, "a", encoding="utf-8") as f:
        f.write(imageTitle + '\n')

if __name__ == '__main__':
    # Fetch the Douban books front page, download every JPG image found and
    # record each image's title in a text file.
    # url stays a module-level name because writeTxt() reads it to build its
    # default file name.
    url = 'https://book.douban.com/'
    # The context manager closes the HTTP response once the page is read.
    with urllib.request.urlopen(url) as res:
        html = res.read().decode('utf-8')
    urlJpgs = getJpg(html)
    imageTitle = getTitle(html)
    # zip() stops at the shorter list, so a missing title can no longer raise
    # IndexError part-way through the run.
    for n, (urlJpg, quotedTitle) in enumerate(zip(urlJpgs, imageTitle)):
        print(n,'--- url -->',urlJpg)
        title = quotedTitle[1:-1]  # drop the surrounding double quotes
        downLoad(urlJpg,title,n)
        writeTxt(title)
   

 

Guess you like

Origin http://10.200.1.11:23101/article/api/json?id=327073302&siteId=291194637