Autohome ("Car Home") web crawler (regular expressions)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Purpose: fetch the Autohome city landing page, follow every link in its
# "list-text" list, and save each image found in the linked page's
# "journey-item-list" container into the local "img" directory.
#
# requests: downloads the page source (the equivalent of urlopen()).
# Beautiful Soup: parses the HTML, replacing the hand-written regular
# expression (re) approach, e.g. BeautifulSoup(html).find("a").
import os

import requests
import bs4
from bs4 import BeautifulSoup

# Directory the downloaded pictures are written to.
IMG_DIR = "img"

# Get the source of the Autohome landing page
# (the equivalent of urlopen(url).read().decode()).
main_page_content = requests.get("https://www.autohome.com.cn/weifang/").text
# Hand the page source to bs4 for parsing.
main_page = BeautifulSoup(main_page_content, "html.parser")
# Locate the tags of interest.
# NOTE(review): class names were restored to lowercase from a
# machine-translated listing; confirm them against the live page.
main_div = main_page.find(name="div", attrs={"class": "people-content"})
main_ul = main_div.find(name="ul", attrs={"class": "list-text"})
main_a_lst = main_ul.find_all("a")  # a list of <a> tags

# open() does not create missing directories, so make sure it exists.
os.makedirs(IMG_DIR, exist_ok=True)

n = 1  # running number used in the saved file names
for a in main_a_lst:
    # Read the link target from the <a> tag.
    href = a.get("href")
    if not href:
        # An anchor without href cannot be followed.
        continue
    # The "https:" prefix is prepended unconditionally.
    # NOTE(review): presumably hrefs are protocol-relative ("//...") -- confirm.
    one_page_url = "https:" + href
    print("====>", one_page_url)
    one_page_content = requests.get(one_page_url).text
    # Strip every "</br>" before parsing.
    one_page_content = one_page_content.replace("</br>", "")
    one_page = BeautifulSoup(one_page_content, "html.parser")
    gallery = one_page.find("div", attrs={"class": "journey-item-list"})
    if gallery is None:
        # find() returns None when the container is absent; skip the page
        # instead of failing on None.find_all().
        continue
    for img in gallery.find_all("img"):
        # Prefer the "data-original" attribute, falling back to "src".
        download_url = img.get("data-original")
        if not download_url:
            download_url = img.get("src")
        if not download_url:
            # Nothing to download for this tag.
            continue
        print(download_url)
        # Download the picture; "with" closes the file even if the
        # request or the write raises.
        img_path = os.path.join(IMG_DIR, "car home picture %s.jpg" % n)
        with open(img_path, mode="wb") as f:
            f.write(requests.get(download_url).content)
        n += 1
        print("Your success from the car home and stole a picture")

 

Guess you like

Origin www.cnblogs.com/tengteng0520/p/11275530.html