Python crawler: crawling the pretty-girl wallpapers you want

First, preparation

1. The target address: the wallpaper detail page http://www.win4000.com/wallpaper_detail_157712.html

2. Inspecting the page HTML shows that the sub-pages follow a regular URL pattern and that the large image on each page uses the class pic-large (a quick check is sketched below).
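To confirm the pic-large class before writing the full crawler, a minimal check might look like the following. This is only a sketch: it assumes requests, bs4, and lxml are installed, and it reuses the detail-page URL from the code in the next section.

import requests
from bs4 import BeautifulSoup

# Fetch one wallpaper page and list the images whose class is pic-large
page = requests.get('http://www.win4000.com/wallpaper_detail_157712.html')
soup = BeautifulSoup(page.content, 'lxml')
for img in soup.find_all('img', class_='pic-large'):
    print(img.get('src'))

If the site ever changes this class name, the check above is the quickest way to see why the crawler finds nothing.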

 

Second, the code

import requests
import os
from bs4 import BeautifulSoup

url = 'http://www.win4000.com/wallpaper_detail_157712.html'
imgmkdir = 'D://Download//ghost_1//'


# Build the list of wallpaper page URLs
# (the sub-pages follow the pattern ..._1.html through ..._9.html)
def getUrlList():
    imgUrlList = []
    base = url.split('.html')[0]
    for i in range(1, 10):
        imgUrlList.append(base + '_' + str(i) + '.html')
    return imgUrlList


# Download a single image
def downImg(imgUrl):
    try:
        if not os.path.exists(imgmkdir):
            os.mkdir(imgmkdir)
        # Save the file under its original name
        imgpath = imgmkdir + imgUrl.split('/')[-1]
        if not os.path.exists(imgpath):
            r = requests.get(imgUrl)
            r.raise_for_status()
            # With a with-statement there is no need to close the file manually;
            # 'wb' writes the response body as binary
            with open(imgpath, 'wb') as f:
                f.write(r.content)
            print(imgUrl + ' [crawl finished]')
        else:
            print(imgUrl.split('/')[-1] + ' [file already exists]')
    except Exception as e:
        print('crawl failed: ' + str(e))


# Find the img tags with class pic-large and download them
def getContent(soup):
    for i in soup.find_all('img', class_='pic-large'):
        imgsrc = i['src']
        if imgsrc.find('http') >= 0 or imgsrc.find('https') >= 0:
            # download the image
            downImg(imgsrc)


# Fetch the html source for a URL and parse it with BeautifulSoup
def getHtmlByUrl(htmlUrl):
    htmlText = requests.get(htmlUrl).content
    soup = BeautifulSoup(htmlText, 'lxml')

    return soup


def main():
    htmlUrlList = getUrlList()
    for url in htmlUrlList:
        htmltext = getHtmlByUrl(url)
        getContent(htmltext)


if __name__ == '__main__':
    main()

Third, the results

Fourth, summary

  The URL list is built with a rather clumsy, hard-coded method; this is just a first attempt to test the waters (a possible refinement is sketched below).
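For reference, a less hard-coded way to build the URL list could be to scrape the sub-page links from the first detail page instead of assuming there are exactly nine of them. The sketch below is only an illustration, not part of the original script: it assumes the sibling pages are linked with hrefs matching wallpaper_detail_<id>_<n>.html, which has not been verified for every gallery on the site, and the helper name getUrlListFromPage is made up for this example.

import re
import requests
from bs4 import BeautifulSoup

# Assumption: the first detail page links its sibling pages with hrefs of the
# form wallpaper_detail_<id>_<n>.html; collect those instead of guessing a count.
def getUrlListFromPage(detailUrl):
    soup = BeautifulSoup(requests.get(detailUrl).content, 'lxml')
    pattern = re.compile(r'wallpaper_detail_\d+_\d+\.html')
    links = set()
    for a in soup.find_all('a', href=True):
        if pattern.search(a['href']):
            links.add(a['href'])
    return sorted(links)

getUrlListFromPage(url) could then stand in for getUrlList() in main(), possibly after joining relative hrefs onto the site root.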

Origin www.cnblogs.com/milicool/p/11262684.html