Today we continue by crawling another site: http://www.27270.com/ent/meinvtupian/
The site has anti-crawling measures, so a few places in the code we write are not handled perfectly. The focus here is on learning the approach; if you have suggestions for improvement, please leave them in the comments.
To make future network requests easier, we first wrap some of the request code in a small helper class.
First, install a module named retrying:
pip install retrying
For the details of how to use this module, look it up yourself (for example on Baidu).
Here I use a method that picks a USER_AGENT at random.
import requests
from retrying import retry
import random
import datetime
class R:
    """Request helper (first excerpt): only the random User-Agent picker is shown."""

    def __init__(self, method="get", params=None, headers=None, cookies=None):
        """Accept the request settings; the body is left out of this excerpt."""
        # do something

    def get_headers(self):
        """Return a headers dict whose User-Agent is picked at random.

        Returns:
            dict: ``{'User-Agent': <one entry of user_agent_list>}``.
        """
        # Every entry needs its own trailing comma: adjacent string literals
        # without one are silently concatenated into a single (bogus) value.
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]
        return {'User-Agent': random.choice(user_agent_list)}
    # other code
Python资源分享qun 784758214 ,内有安装包,PDF,学习视频,这里是Python学习者的聚集地,零基础,进阶,都欢迎
retrying
The simplest way to use retrying is to add the @retry decorator to the method you want to be retried.
Here, I want the network request method to be attempted up to three times when an error occurs.
At the same time, the R class
gains some necessary initialization parameters; you can see them directly in the code below.
__retrying_requests
is a private method that chooses between get
and post
according to the request method.
import requests
from retrying import retry
import random
import datetime
class R:
    """Request helper (second excerpt): adds the retried request method."""

    def __init__(self, method="get", params=None, headers=None, cookies=None):
        """Accept the request settings; the body is left out of this excerpt."""
        #do something

    def get_headers(self):
        """Build the request headers; the body is left out of this excerpt."""
        # do something

    @retry(stop_max_attempt_number=3)
    def __retrying_requests(self, url):
        """Fetch *url* and return the raw response body.

        The decorator re-runs the call when it raises, for at most three
        attempts in total. A "get" method issues a GET; any other method
        value issues a POST that also sends the stored params.
        """
        # Keyword arguments common to both kinds of request.
        shared = dict(headers=self.__headers, cookies=self.__cookies, timeout=3)
        if self.__method != "get":
            return requests.post(url, params=self.__params, **shared).content
        return requests.get(url, **shared).content
    # other code
With that, the network request method is complete; it returns response.content,
i.e. the raw byte stream.
Building on this private method, we now add one method for fetching a page's text and one for fetching a file, and complete the class initializer at the same time. During development we discovered that the pages we want to crawl are encoded in gb2312,
so the text method needs an extra encoding parameter.
import requests
from retrying import retry
import random
import datetime
class R:
    """Request helper (third excerpt): initializer plus text/file fetch methods."""

    # Class initializer
    def __init__(self, method="get", params=None, headers=None, cookies=None):
        """Store the request settings.

        Args:
            method: "get" selects a GET request; see __retrying_requests.
            params: parameters sent with non-GET requests.
            headers: extra headers merged over the generated defaults.
            cookies: cookies sent with every request.
        """
        self.__method = method
        # Start from the generated headers, then let the caller override them.
        myheaders = self.get_headers()
        if headers is not None:
            myheaders.update(headers)
        self.__headers = myheaders
        self.__cookies = cookies
        self.__params = params

    def get_headers(self):
        """Build the default headers; the body is left out of this excerpt."""
        # do something

    @retry(stop_max_attempt_number=3)
    def __retrying_requests(self, url):
        """Perform the request; the body is left out of this excerpt."""
        # do something

    # GET request returning text
    def get_content(self, url, charset="utf-8"):
        """Return the body of *url* decoded with *charset*, or None on failure.

        Both request errors (after the retries are exhausted) and decoding
        errors result in None.
        """
        try:
            return self.__retrying_requests(url).decode(charset)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            return None

    def get_file(self, file_url):
        """Return the raw body of *file_url*, or None when the request fails."""
        try:
            return self.__retrying_requests(file_url)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            return None
With this, the R class
is complete. To get the full code, piece together the excerpts above, or skip to the end of the article and get it directly from GitHub.
Next comes the more important part: the crawler code itself. This time we use some simple classes and objects, plus some simple multithreading.
First, create an ImageList
class. The first thing it needs to do is get the total number of listing pages to crawl.
This step is relatively simple:
- Get the page source
- Use a regular expression to match the "last page" (末页) element
- Extract the number
import http_help as hh # 这个http_help 是我上面写到的那个R类
import re
import threading
import time
import os
import requests
# 获取所有待爬取的URL列表
class ImageList():
    """Builds the list of listing-page URLs that are waiting to be crawled."""

    def __init__(self):
        self.__start = "http://www.27270.com/ent/meinvtupian/list_11_{}.html"  # URL template
        # Request headers
        self.__headers = {"Referer": "http://www.27270.com/ent/meinvtupian/",
                          "Host": "www.27270.com"
                          }
        self.__res = hh.R(headers=self.__headers)  # request helper used for every fetch

    def run(self):
        """Return the URLs of all listing pages, first to last.

        Returns:
            list[str]: one URL per page; an empty list when the page count
            could not be determined.
        """
        page_count = int(self.get_page_count())
        if page_count == 0:
            return []
        # range() excludes its upper bound, so +1 keeps the last page.
        return [self.__start.format(i) for i in range(1, page_count + 1)]

    # Match the "last page" link with a regular expression to read the page count
    def get_page_count(self):
        """Return the number of listing pages as an int (0 when unknown)."""
        # Note: the encoding has to be passed in here.
        content = self.__res.get_content(self.__start.format("1"), "gb2312")
        if content is None:
            # The fetch failed, so there is nothing to search.
            return 0
        pattern = re.compile(r"<li><a href='list_11_(\d+?).html' target='_self'>末页</a></li>")
        search_text = pattern.search(content)
        if search_text is None:
            return 0
        return int(search_text.group(1))
if __name__ == '__main__':
    # Build the list of listing-page URLs when this file is run as a script.
    img = ImageList()
    urls = img.run()
Note that in the code above, the get_page_count
method obtains the page number of the last page.
Inside our run
method, the list comprehension
urls = [self.__start.format(i) for i in range(1,page_count)]
generates all the links to be crawled in one batch.
27270 pictures ---- having obtained the list of URLs from the analysis above, we now crawl the detail pages
We use the producer-consumer model: one thread collects the picture links and another downloads the pictures. Because this is done with multiple threads, we first need to import
import threading
import time
The complete code is as follows
import http_help as hh
import re
import threading
import time
import os
import requests
urls_lock = threading.Lock() # lock held while the shared list of listing-page URLs is modified
imgs_lock = threading.Lock() # lock held while imgs_start_urls is modified
imgs_start_urls = [] # gallery entries found by the producer, waiting for the consumer
class Product(threading.Thread):
    """Producer thread: reads listing pages and queues the gallery links found."""

    # Class initializer
    def __init__(self, urls):
        """Remember the work list and set up the request helper.

        Args:
            urls: list of listing-page URLs. The same list object may be
                handed to several producers; None is treated as empty.
        """
        threading.Thread.__init__(self)
        self.__urls = urls if urls is not None else []
        self.__headers = {"Referer": "http://www.27270.com/ent/meinvtupian/",
                          "Host": "www.27270.com"
                          }
        self.__res = hh.R(headers=self.__headers)

    # Put a URL back into the work list after it failed
    def add_fail_url(self, url):
        """Report *url* as failed and re-insert it at the front of the list."""
        print("{}该URL抓取失败".format(url))
        with urls_lock:
            self.__urls.insert(0, url)

    # Thread body
    def run(self):
        """Process listing pages until the work list is empty.

        Failed pages are re-queued without a retry limit, so a page that
        never succeeds keeps this loop alive.
        """
        print("*"*100)
        while True:
            # Test and pop under one lock so another producer cannot empty
            # the list between the check and the pop.
            with urls_lock:
                last_url = self.__urls.pop() if self.__urls else None
            if last_url is None:
                print("所有链接已经运行完毕")
                break
            print("正在操作{}".format(last_url))
            # The pages are gb2312-encoded; other encodings fail to decode.
            content = self.__res.get_content(last_url, "gb2312")
            if content is None:
                self.add_fail_url(last_url)
                continue
            html = self.get_page_list(content)
            if len(html) == 0:
                self.add_fail_url(last_url)
            else:
                # Hand the galleries found on this page to the download queue.
                with imgs_lock:
                    imgs_start_urls.extend(html)
            # Pause after every fetched page.
            time.sleep(5)

    def get_page_list(self, content):
        """Return the (href, title) pairs of all gallery links in *content*."""
        # Regular expression for one gallery entry
        pattern = re.compile('<li> <a href="(.*?)" title="(.*?)" class="MMPic" target="_blank">.*?</li>')
        return re.findall(pattern, content)
Python资源分享qun 784758214 ,内有安装包,PDF,学习视频,这里是Python学习者的聚集地,零基础,进阶,都欢迎
The most important point in the code above is
the use of the threading.Lock() lock: when multiple threads operate on shared global variables, they must take the lock at the right time;
the other points to note are covered in the comments. As long as you follow the steps, write the code bit by bit and add your own understanding as you go, you will be able to handle it.
So far we have collected the addresses of all the picture galleries and stored them in the global variable imgs_start_urls.
Now let's continue.
This list stores addresses such as http://www.27270.com/ent/meinvtupian/2018/298392.html
When you open such a page, you will find only one picture, with pagination links below it.
After clicking through the pages, the pattern becomes clear:
http://www.27270.com/ent/meinvtupian/2018/298392.html
http://www.27270.com/ent/meinvtupian/2018/298392_2.html
http://www.27270.com/ent/meinvtupian/2018/298392_3.html
http://www.27270.com/ent/meinvtupian/2018/298392_4.html
....
After a few attempts, you will find that the links can be built simply by appending the page number. And what is shown when the requested page does not exist?
Well, if you have followed the steps above, you should already know how to implement this!
I attach all the code directly below, again using comments to mark the most important places.
class Consumer(threading.Thread):
    """Consumer thread: walks each queued gallery page by page and saves its pictures."""

    # Initializer
    def __init__(self):
        threading.Thread.__init__(self)
        self.__headers = {"Referer": "http://www.27270.com/ent/meinvtupian/",
                          "Host": "www.27270.com"}
        self.__res = hh.R(headers=self.__headers)

    # Picture download method
    def download_img(self, filder, img_down_url, filename):
        """Save one picture as ./downs/<filder>/<filename>.

        Args:
            filder: name of the sub-directory of ./downs to write into.
            img_down_url: URL of the picture.
            filename: file name to store the picture under.

        Nothing is downloaded when the target file already exists. Download
        and write errors are printed, not raised.
        """
        file_path = "./downs/{}".format(filder)
        # makedirs also creates ./downs itself and tolerates an existing directory.
        os.makedirs(file_path, exist_ok=True)
        target = "./downs/{}/{}".format(filder, filename)
        if os.path.exists(target):
            return
        try:
            # The Host header is a pitfall: to prevent hotlinking, the pictures
            # are stored on a different server.
            img = requests.get(img_down_url, headers={"Host": "t2.hddhhn.com"}, timeout=3)
        except Exception as e:
            print(e)
            # There is no response to write, so stop here.
            return
        print("{}写入图片".format(img_down_url))
        try:
            with open(target, "wb+") as f:
                f.write(img.content)
        except Exception as e:
            print(e)

    def run(self):
        """Take gallery entries from imgs_start_urls forever and download them.

        Each entry is a sequence whose first item is the gallery URL. The
        gallery's pages are requested as <base>_1.html, <base>_2.html, ...
        until a page without a picture is returned. If a page cannot be
        loaded, the entry is put back on the queue (no retry limit) and the
        walk over that gallery stops.
        """
        pattern = re.compile(r'<div class="articleV4Body" id="picBody">[\s\S.]*?img alt="(.*?)".*? src="(.*?)" />')
        while True:
            # Test and pop under one lock so another consumer cannot empty
            # the queue between the check and the pop.
            with imgs_lock:
                entry = imgs_start_urls.pop(0) if imgs_start_urls else None
            if entry is None:
                # Nothing queued yet: wait briefly instead of spinning.
                time.sleep(0.5)
                continue
            # e.g. http://www.27270.com/ent/meinvtupian/2018/295631_1.html
            page_url = entry[0]
            base_url = page_url[0:page_url.rindex(".")]  # drop the ".html" suffix
            start_index = 1
            while True:
                img_url = "{}_{}.html".format(base_url, start_index)  # build the page URL
                content = self.__res.get_content(img_url, charset="gbk")  # decoded as gbk here
                if content is None:
                    print("{}链接加载失败".format(img_url))
                    # Re-queue the original entry: the queue holds entries of
                    # that shape, not bare page URLs.
                    with imgs_lock:
                        imgs_start_urls.append(entry)
                    break
                # No picture on the page means this gallery is finished.
                found = pattern.search(content)
                if found is None:
                    print("-"*100)
                    print(content)
                    break
                filder = found.group(1)
                img_down_url = found.group(2)
                filename = img_down_url[img_down_url.rindex("/")+1:]
                self.download_img(filder, img_down_url, filename)  # download the picture
                start_index += 1  # move on to the next page of the gallery
That is all of the code. I have tried to annotate the key places; read it carefully, and if you really do not understand something, type it out a few times yourself. Nothing here is particularly complex; most of it is just logic.
Finally, here is the main part of the code, which gets everything up and running.
if __name__ == '__main__':
    # Collect the listing-page URLs first.
    img = ImageList()
    urls = img.run()
    # range(1,2) yields a single value, so exactly one producer thread is started.
    for i in range(1,2):
        p = Product(urls)
        p.start()
    # Likewise, exactly one consumer thread is started.
    for i in range(1,2):
        c = Consumer()
        c.start()
Python资源分享qun 784758214 ,内有安装包,PDF,学习视频,这里是Python学习者的聚集地,零基础,进阶,都欢迎
After a while, the pictures slowly start to come in.