# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import urllib
import time
from lxml import etree
# Request headers sent with every fetch: a desktop Chrome User-Agent and a
# captured session Cookie. NOTE(review): presumably required to avoid the
# site's bot detection — confirm whether the hard-coded Cookie still works,
# as captured session cookies expire.
heads={
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
'Cookie':'cityDomain=bj; uuid=35da5ae3-0a1d-4523-8105-98400a2d0996; antipas=57286K032121117f543W88696U; ganji_uuid=5535224847520343723852; sessionid=4210d228-3e67-4ac2-aeba-e43dbcee1c29; lg=1; close_finance_popup=2018-07-14; clueSourceCode=10103188612%2300; cainfo=%7B%22ca_s%22%3A%22sem_baiduss%22%2C%22ca_n%22%3A%22bdpc_sye%22%2C%22ca_i%22%3A%22-%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22%25E4%25BA%258C%25E6%2589%258B%25E8%25BD%25A6%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22%25e4%25ba%258c%25e6%2589%258b%25e8%25bd%25a6%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%2249886960919%22%2C%22scode%22%3A%2210103188612%22%2C%22ca_transid%22%3Anull%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%2235da5ae3-0a1d-4523-8105-98400a2d0996%22%2C%22sessionid%22%3A%224210d228-3e67-4ac2-aeba-e43dbcee1c29%22%7D; preTime=%7B%22last%22%3A1531578197%2C%22this%22%3A1531574022%2C%22pre%22%3A1531574022%7D'
}
def get_url():
    """Fetch the Guazi Beijing homepage and collect car-listing image URLs.

    Parses the page with lxml and selects ``//li/a/img`` elements via XPath.

    Returns:
        list: unique ``src`` attribute values (order not guaranteed, since
        duplicates are removed through a set).
    """
    data = requests.get("https://www.guazi.com/bj/", headers=heads).text
    # Crude politeness delay so repeated runs don't hammer the site.
    time.sleep(4)
    selector = etree.HTML(data)
    result = []
    imgs = selector.xpath("//li/a/img")
    for img in imgs:
        src = img.get('src')
        # print() form is valid on both Python 2 and 3; the original
        # `print x` statement is a SyntaxError on Python 3.
        print(src)
        if src:  # skip <img> elements that have no src attribute
            result.append(src)
    return list(set(result))
def save_pic(imgs_url, path):
    """Download every URL in *imgs_url* into *path* as ``0.jpg``, ``1.jpg``, ...

    Args:
        imgs_url: iterable of image URLs (any scheme urlretrieve supports).
        path: destination prefix; the numbered filename is appended directly,
            so include a trailing separator (e.g. ``"d:\\\\"``).
    """
    # urlretrieve moved to urllib.request in Python 3; the bare
    # urllib.urlretrieve used originally raises AttributeError there.
    try:
        from urllib.request import urlretrieve
    except ImportError:  # Python 2 fallback
        from urllib import urlretrieve
    for m, img_url in enumerate(imgs_url):
        print('***** ' + ' Downloading...')
        urlretrieve(img_url, path + str(m) + '.jpg')
    print('Download complete!')
save_pic(get_url(),"d:\\")
先上了代码,urllib.urlretrieve的方法用来将爬取的图片url下载到相应的文件夹中,真是好方便!
于是分为两个函数,第一个用来爬取图片的url,第二个用来下载图片;
第一个函数我用的是lxml的etree,可以把资源解析成XML格式,这样才能用XPath,因为本人觉得XPath是最好用的定位方法~~~
imgs列表中的每个元素的type是lxml.etree._element,不是文本,i.get("src")可以提取相应的属性~
第二个函数就是直接下载了,没有什么可说的~有问题欢迎提问