This is my first small web-scraping project. The code is rough, and it only grabs the image sets listed on the first page.
Pagination is certainly possible — it just felt like too much trouble, so I'll add it once I've learned more. Besides, downloading everything would only waste disk space; it's better to save the time for real practice projects.
Enough rambling:
The site being crawled: https://www.mzitu.com/tag/youhuo/
Let's open the site's home page:
There are many galleries on the page — we want all of them.
Step by step, open the browser's developer tools:
The blue box marks the first gallery's href, which is the address we want to extract; then the second, the third, and so on, as follows:
import requests
from requests.exceptions import RequestException

# NOTE: the Referer header matters — mzitu.com blocks requests without it
# (anti-scraping); the other headers were copied from the browser and are
# harmless to include.
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def get_page(url):
    """Fetch *url* and return its HTML text (also echoed to stdout).

    Returns None when the response status is not 200 or the request fails.
    """
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.text)
            return response.text
        return None
    except RequestException:
        print('获取索引页失败')
        return None


def main():
    get_page('https://www.mzitu.com/tag/youhuo/')


if __name__ == '__main__':
    main()
We can easily get the output shown above.
Analyzing the resulting HTML, the red boxes mark exactly the gallery addresses we want to extract.
We bring in BeautifulSoup ("soup") to start extracting the gallery addresses:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# Anti-scraping countermeasure: mzitu.com requires a Referer header.
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def get_page(url):
    """Download *url*; return its HTML text (also echoed) or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.text)
            return response.text
        return None
    except RequestException:
        print('获取索引页失败')
        return None


def parse_page(html):
    """Print the link (href) of every gallery listed on the index page."""
    soup = BeautifulSoup(html, 'lxml')
    for entry in soup.select('#pins li'):
        print(entry.a['href'])


def main():
    html = get_page('https://www.mzitu.com/tag/youhuo/')
    parse_page(html)


if __name__ == '__main__':
    main()
The BeautifulSoup library is very powerful, and I've decided to study it further. `soup.select` returns a list, so we must iterate over it to print each entry.
We now have the address of every gallery; open one and take a look:
Each detail page shows only a single image, but there is a page index below it, so we need to handle the detail page's pagination.
Since every gallery arranges its page URLs the same way, we can loop to build the detail-page URL for every picture in each gallery:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# Anti-scraping countermeasure: mzitu.com requires a Referer header.
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def get_page(url):
    """Download *url*; return its HTML text (also echoed) or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.text)
            return response.text
        return None
    except RequestException:
        print('获取索引页失败')
        return None


def parse_page(html):
    """Visit every gallery link found on the index page."""
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#pins li')
    for link in items:
        href = link.a['href']
        get_detail_page(href)


def get_detail_page(href):
    """Probe detail pages ``href/1`` .. ``href/99``, printing each URL that
    responds 200, and stop at the first non-200 (past the gallery's end).

    BUG FIX: the original ended with a stray, unreachable
    ``response = requests.get()`` (no arguments) left over from editing;
    it has been removed.
    """
    for i in range(1, 100):  # caps at 99 pages; galleries have far fewer
        detail_url = href + '/' + str(i)
        if requests.get(detail_url, headers=headers).status_code == 200:
            print(detail_url)
        else:
            print('已至末尾页')
            return None


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()
A quick explanation: I set the loop limit to 100 because even the larger galleries have only about 46 pictures. I use the HTTP status code to decide whether a URL is real — 200 means a valid page, anything else means we have run past the end, so we stop. This way we collect every detail-page URL for each gallery on the site:
The next step is to fetch the HTML corresponding to each of those URLs:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# Anti-scraping countermeasure: mzitu.com requires a Referer header.
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def get_page(url):
    """Download *url*; return its HTML text, or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.text)
            return response.text
        return None
    except RequestException:
        print('获取索引页失败')
        return None


def parse_page(html):
    """Visit every gallery link found on the index page."""
    soup = BeautifulSoup(html, 'lxml')
    for entry in soup.select('#pins li'):
        get_detail_page(entry.a['href'])


def get_detail_page(href):
    """Walk detail pages ``href/1`` .. ``href/99``; parse each page that
    responds 200 and stop at the first non-200 (past the gallery's end)."""
    for page in range(1, 100):
        detail_url = href + '/' + str(page)
        if requests.get(detail_url, headers=headers).status_code == 200:
            parse_detail_page(detail_url)
        else:
            print('已至末尾页')
            return None


def parse_detail_page(detail_url):
    """Fetch one detail page and print its HTML (image extraction comes next)."""
    try:
        response = requests.get(detail_url, headers=headers)
        if response.status_code == 200:
            print('获取详情页成功')
            detail_html = response.text
            print(detail_html)
            # get_image(detail_html)  # next step: extract the image URL
        return None
    except RequestException:
        print('获取详情页失败')
        return None


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()
Now we have the HTML of each detail page:
BeautifulSoup can then parse out the image URL:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# Anti-scraping countermeasure: mzitu.com requires a Referer header.
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def get_page(url):
    """Download *url*; return its HTML text, or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.text)
            return response.text
        return None
    except RequestException:
        print('获取索引页失败')
        return None


def parse_page(html):
    """Visit every gallery link found on the index page."""
    soup = BeautifulSoup(html, 'lxml')
    for entry in soup.select('#pins li'):
        get_detail_page(entry.a['href'])


def get_detail_page(href):
    """Walk detail pages ``href/1`` .. ``href/99``; parse each page that
    responds 200 and stop at the first non-200 (past the gallery's end)."""
    for page in range(1, 100):
        detail_url = href + '/' + str(page)
        if requests.get(detail_url, headers=headers).status_code == 200:
            parse_detail_page(detail_url)
        else:
            print('已至末尾页')
            return None


def parse_detail_page(detail_url):
    """Fetch one detail page and hand its HTML to get_image()."""
    try:
        response = requests.get(detail_url, headers=headers)
        if response.status_code == 200:
            print('获取详情页成功')
            get_image(response.text)
        return None
    except RequestException:
        print('获取详情页失败')
        return None


def get_image(detail_html):
    """Return the src URL of the first ``.main-image`` img, or None."""
    soup = BeautifulSoup(detail_html, 'lxml')
    nodes = soup.select('.main-image')
    if nodes:
        return nodes[0].img['src']


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()
Finally, saving the pictures. Here is the complete code — it is not very modular yet; improvements will come in a later post.
import requests
import os
from hashlib import md5
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

# Anti-scraping countermeasure: mzitu.com requires a Referer header.
headers = {
    'If-None-Match': 'W/"5cc2cd8f-2c58"',
    "Referer": "http://www.mzitu.com/all/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 SafarMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


def get_page(url):
    """Download *url*; return its HTML text, or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.text)
            return response.text
        return None
    except RequestException:
        print('获取索引页失败')
        return None


def parse_page(html):
    """Visit every gallery link found on the index page."""
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#pins li')
    for link in items:
        href = link.a['href']
        get_detail_page(href)


def get_detail_page(href):
    """Walk detail pages ``href/1`` .. ``href/99``; parse each page that
    responds 200 and stop at the first non-200 (past the gallery's end).

    NOTE(review): each page is fetched twice (once here for the status
    check, once in parse_detail_page for the body) — worth consolidating
    in a later refactor.
    """
    for i in range(1, 100):
        detail_url = href + '/' + str(i)
        if requests.get(detail_url, headers=headers).status_code == 200:
            parse_detail_page(detail_url)
        else:
            print('已至末尾页')
            return None


def parse_detail_page(detail_url):
    """Fetch one detail page and hand its HTML to get_image()."""
    try:
        response = requests.get(detail_url, headers=headers)
        if response.status_code == 200:
            print('获取详情页成功')
            detail_html = response.text
            get_image(detail_html)
        return None
    except RequestException:
        print('获取详情页失败')
        return None


def get_image(detail_html):
    """Extract the ``.main-image`` img src and download it."""
    soup = BeautifulSoup(detail_html, 'lxml')
    items = soup.select('.main-image')
    for item in items:
        image = item.img['src']
        save_image(image)


def save_image(image):
    """Download *image* and write it to the working directory.

    The file is named after the MD5 of its content, which doubles as
    de-duplication: an already-saved picture is silently skipped.

    BUG FIX: removed the redundant ``f.close()`` inside the ``with``
    block — the context manager already closes the file.
    """
    response = requests.get(image, headers=headers)
    if response.status_code == 200:
        data = response.content
        file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(data).hexdigest(), 'jpg')
        print(file_path)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(data)
            print('保存成功')
    else:
        print('保存失败')
        return None


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()
To tell the truth, I personally prefer hands-on practice over daydreaming — hence this project, haha.