Python data mining, part 2: web crawlers

Python crawlers

urllib usage

EG1:
from urllib import request
data = request.urlopen(url_string).read()  # data holds the full source of the page
data = data.decode("utf-8")  # decode the raw bytes
import re
pat = '<div class="name">(.*?)</div>'
res = re.findall(pat, data)  # res is a list of the matched results
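To see what the non-greedy (.*?) group captures, here is a self-contained run of the same kind of pattern against a small inline HTML snippet (the markup is invented for illustration):

import re

html = '<div class="name">Alice</div><div class="name">Bob</div>'
pat = '<div class="name">(.*?)</div>'  # non-greedy, so each div is matched separately
print(re.findall(pat, html))  # ['Alice', 'Bob']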
EG2:
request.urlretrieve(url, filename=local_filename)  # download the page at url straight to the given file
request.urlcleanup()  # urlretrieve leaves cache files behind; urlcleanup clears them
# On the response object returned by urlopen: .info() gives the headers,
# .getcode() the HTTP status code, .geturl() the URL actually visited
data = request.urlopen(url_string, timeout=5).read()  # timeout is in seconds
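A minimal sketch tying these calls together (http://www.xxx.com stands in for a real site, as above):

from urllib import request

resp = request.urlopen("http://www.xxx.com", timeout=5)
print(resp.getcode())  # HTTP status code, e.g. 200
print(resp.geturl())   # final URL after any redirects
print(resp.info())     # response headers
data = resp.read()

request.urlretrieve("http://www.xxx.com", filename="page.html")  # save straight to disk
request.urlcleanup()  # clean up urlretrieve's temporary files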

Simulating an HTTP POST request

import urllib.parse  # for encoding the POST payload
from urllib import request
url = "http://www.xxx.com"
data = urllib.parse.urlencode({
    "name": "xuqiqiang",
    "password": "heaoiwoe"
}).encode("utf-8")
req = request.Request(url, data)  # a Request with a data payload is sent as POST
data = request.urlopen(req).read()
fh = open("D:\\loadfile.html", 'wb')
fh.write(data)
fh.close()
# ----------------- the response has now been saved
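urlencode simply produces an ordinary form-encoded string, which is then turned into bytes for the request body; a quick check:

import urllib.parse

payload = urllib.parse.urlencode({"name": "xuqiqiang", "password": "heaoiwoe"})
print(payload)                  # name=xuqiqiang&password=heaoiwoe
print(payload.encode("utf-8"))  # b'name=xuqiqiang&password=heaoiwoe' -- what Request takes as data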

Crawler exceptions

import urllib.error
try:
    ...
except urllib.error.URLError as e:  # catch URLError; urllib.error itself is a module, not an exception
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
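Concretely, urllib.error defines HTTPError (a server error reply, which carries .code) as a subclass of URLError (any failure, which carries .reason), so the two can also be caught separately; a sketch with a placeholder URL:

import urllib.request
import urllib.error

try:
    data = urllib.request.urlopen("http://www.xxx.com").read()
except urllib.error.HTTPError as e:  # server answered with an error status
    print(e.code, e.reason)
except urllib.error.URLError as e:   # network-level failure (DNS, refused connection, ...)
    print(e.reason)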

Disguising the crawler as a browser

If the server returns error code 403 while crawling, the site has detected and blocked the crawler. The previous direct-crawling approach no longer works; the crawler has to disguise itself as a browser before crawling.
import urllib.request
url = "http://www.xxx.com"
header = ("User-Agent", "...")  # a (header name, header value) pair
opener = urllib.request.build_opener()
opener.addheaders = [header]

# Option 1: use the opener directly
data = opener.open(url).read().decode("utf-8", "ignore")

# Option 2: install the opener globally, then call urlopen as usual
opener = urllib.request.build_opener()
opener.addheaders = [header]
urllib.request.install_opener(opener)
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")

fh = open(filepath, 'w', encoding="utf-8")  # data is a decoded str, so write in text mode
fh.write(data)
fh.close()
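A third equivalent way, convenient for a single request, is to pass the headers straight to urllib.request.Request:

import urllib.request

url = "http://www.xxx.com"
req = urllib.request.Request(url, headers={"User-Agent": "..."})
data = urllib.request.urlopen(req).read().decode("utf-8", "ignore")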

A news crawler in practice

import urllib.request
import urllib.error
import re

data = urllib.request.urlopen("http://news.sina.com.cn").read()
data = data.decode("utf-8", "ignore")
pat = 'href="(http://news.sina.com.cn/.*?)">'
all_url = re.findall(pat, data)
for i in range(len(all_url)):
    thisurl = all_url[i]
    file = "newsFile" + str(i) + ".html"
    try:
        urllib.request.urlretrieve(thisurl, file)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

Getting around anti-crawler blocking with a proxy server

import urllib.request as rq

def use_proxy(url, proxy_addr):
    # Route HTTP traffic through the proxy at proxy_addr ("ip:port")
    proxy = rq.ProxyHandler({"http": proxy_addr})
    opener = rq.build_opener(proxy, rq.HTTPHandler)
    rq.install_opener(opener)  # later urlopen calls now go through the proxy
    return rq.urlopen(url).read().decode("utf-8", "ignore")
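Usage, with a made-up proxy address (free proxies change constantly, so the address below is purely illustrative):

proxy_addr = "127.0.0.1:8888"  # hypothetical proxy; replace with a live one
data = use_proxy("http://www.xxx.com", proxy_addr)
print(len(data))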

Taobao picture crawling
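A minimal sketch of the idea, reusing the techniques above. The search URL and the pic_url pattern are assumptions, and real Taobao listing pages load results via JavaScript, so this plain-urllib version is only an outline of the general pattern:

import urllib.request
import urllib.error
import re

# Hypothetical listing URL and field name -- Taobao's real page structure differs.
url = "https://s.taobao.com/search?q=shoes"
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")

pat = '"pic_url":"(//.*?)"'  # assumed JSON field embedded in the page source
pic_urls = re.findall(pat, data)
for i, pic in enumerate(pic_urls):
    try:
        urllib.request.urlretrieve("http:" + pic, "taobao_" + str(i) + ".jpg")
    except urllib.error.URLError as e:
        print(e.reason)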
