Python数据分析与挖掘

一、爬虫入门

　　网络爬虫（又被称为网页蜘蛛、网络机器人，在FOAF社区中更经常地被称为网页追逐者），是一种按照一定的规则，自动抓取万维网信息的程序或者脚本。

　　运用python3.6中的urllib.request

1.快速爬取一个网页　

（1）get请求方式　

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
import urllib.request
# Search keyword. Any text works (e.g. "python"); non-ASCII text such as the
# value below must be percent-encoded before it can be placed in a URL.
keywd = "百度"
keywd = urllib.request.quote(keywd)

url = "http://www.baidu.com/s?wd=" + keywd
req = urllib.request.Request(url)
# Read the whole response body into memory. The context manager closes the
# connection even if read() raises.
with urllib.request.urlopen(req) as response:
    data = response.read()

# Store the raw bytes unchanged; "wb" avoids any text re-encoding.
with open("F:/python/data/douban/2.html", "wb") as fh:
    fh.write(data)

View Code

（2）post请求方式

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
#post请求
#登录模拟


import urllib.request
import urllib.parse

url = "http://www.iqianyue.com/mypost/"
# Form fields for the simulated login. urlencode() builds the
# "name=...&pass=..." body, which must be sent as bytes.
# NOTE(review): the "name" value looks like an e-mail address that was
# mangled when this page was scraped — replace it with a real value.
mydata = urllib.parse.urlencode({
    "name": "[email protected]",
    "pass": "123ssd",
}).encode("utf-8")
# Passing a data payload to Request turns the request into a POST.
req = urllib.request.Request(url, mydata)
# The context manager closes the connection even if read() raises.
with urllib.request.urlopen(req) as response:
    data = response.read()

# Store the raw response bytes unchanged.
with open("F:/python/data/douban/2_1.html", "wb") as fh:
    fh.write(data)

View Code

2.模拟浏览器访问

　　应用场景：有些网页为了防止别人恶意采集其信息所以进行了一些反爬虫的设置，而我们又想进行爬取。
　　解决方法：设置一些Headers信息（User-Agent），模拟成浏览器去访问这些网站。

爬取淘宝高清图片

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
import urllib.request
import re
keyname = "连衣裙"
# Percent-encode the keyword so it can be embedded in the query string.
key = urllib.request.quote(keyname)

# Pretend to be desktop Firefox. The value must be a well-formed User-Agent
# ("Mozilla/5.0 (...; Win64; x64; rv:60.0) ..."), otherwise the disguise is
# easy to spot.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0")
# Build an opener that carries the header ...
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# ... and install it globally so the urllib.request calls below use it.
urllib.request.install_opener(opener)

# Captures the protocol-relative image address stored in each "pic_url" field.
pat = 'pic_url":"//(.*?)"'
pic_url_re = re.compile(pat)

for i in range(0, 1):
    # Build the listing URL; the "s" parameter is set to i * 60
    # (presumably the result offset, 60 items per page — verify).
    url = "https://s.taobao.com/list?spm=a217m.8316598.313651-static.30.3f8533d5oZ7vEf&q=" + key + "&cat=50344007&style=grid&seller_type=taobao&bcoffset=12&s=" + str(i * 60)
    # "ignore" drops undecodable bytes. The previous spelling "ingnore" is not
    # a registered error handler and raised LookupError on the first bad byte.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode("utf-8", "ignore")

    # All image addresses found on this page (without the leading "//").
    image_list = pic_url_re.findall(data)
    print(image_list)
    for j, thisimg in enumerate(image_list):
        thisimg_url = "http://" + thisimg
        # File name is page index followed by image index, e.g. "03.jpg".
        file = "F:/python/data/douban/img/" + str(i) + str(j) + ".jpg"
        urllib.request.urlretrieve(thisimg_url, filename=file)

View Code

爬取CSDN数据

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
import urllib.request
import re
url = "http://blog.csdn.net/"
# Pretend to be a desktop Chrome browser via the User-Agent header.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
# Build an opener that carries the header ...
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# ... and install it globally so the urllib.request calls below use it.
urllib.request.install_opener(opener)

# Fetch the front page. "ignore" drops undecodable bytes; the previous
# spelling "ingnore" is not a registered error handler and raised LookupError
# on the first bad byte.
with urllib.request.urlopen(url) as response:
    data = response.read().decode("utf-8", "ignore")

# Captures the href of every anchor that is followed by target/data-track.
pat = '<a href="(.*?)" target="_blank" data-track'
result = re.compile(pat).findall(data)
for i, link in enumerate(result):
    # One numbered HTML file per extracted link.
    file = "F:/python/data/douban/csdn/" + str(i) + ".html"
    urllib.request.urlretrieve(link, filename=file)
    print("第" + str(i) + "爬取成功")

View Code

3.异常处理

　　爬虫在爬取网站数据时常见的错误有两类：URLError和HTTPError（HTTPError是URLError的子类，并带有HTTP状态码code）。

　　脚本中加入异常处理机制使爬虫脚本更稳健。

爬取新浪新闻首页

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
"""
需求：将新浪新闻首页（http://news.sina.com.cn/）所有新闻都爬取到本地
思路：先爬首页，通过正则获取所有新闻链接，然后依次爬取新闻，并存储到本地
"""
import urllib.request
import urllib.error
import re
# Download the front page into memory and decode it, dropping any bytes that
# are not valid UTF-8.
data = urllib.request.urlopen("http://news.sina.com.cn/").read()
data2 = data.decode("utf-8", "ignore")

# Collect every absolute news.sina.com.cn link that appears in an anchor tag.
pat = '<a href="(http://news.sina.com.cn/.*?)"'
allurl = re.findall(pat, data2)

for i, this_url in enumerate(allurl):
    try:
        print(f"第{i}次爬取")
        file = f"F:/python/data/douban/sinanews/{i}.html"
        # Save the linked page straight to disk.
        urllib.request.urlretrieve(this_url, file)
        print("---------------成功--------------")
    except urllib.error.URLError as e:
        # Print whichever of the two attributes the error carries, status
        # code first, then reason.
        for attr in ("code", "reason"):
            if hasattr(e, attr):
                print(getattr(e, attr))

View Code

4.代理服务器

　　（1）使用代理服务器的一般格式

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
#西祠免费代理IP
#http://www.xicidaili.com/
#183.167.217.152    63000
import urllib.request
import urllib.error
#代理服务器
def use_proxy(url,proxy_addr):
    """Fetch *url* with HTTP traffic routed through *proxy_addr*.

    Args:
        url: Address to download.
        proxy_addr: Proxy in "host:port" form, applied to the "http" scheme.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes dropped), or
        None when a URLError occurred; in that case the error's ``code``
        and/or ``reason`` are printed.

    Side effects:
        Installs the proxy-enabled opener as the global urllib opener.
    """
    try:
        handler = urllib.request.ProxyHandler({"http": proxy_addr})
        proxied_opener = urllib.request.build_opener(handler, urllib.request.HTTPHandler)
        # Make the proxy opener the default for urllib.request.urlopen().
        urllib.request.install_opener(proxied_opener)
        raw = urllib.request.urlopen(url).read()
        return raw.decode("utf-8", "ignore")
    except urllib.error.URLError as err:
        # Print whichever of the two attributes the error carries, status
        # code first, then reason.
        for attr in ("code", "reason"):
            if hasattr(err, attr):
                print(getattr(err, attr))
    return None


# Free proxies expire quickly; replace this address with a working one.
proxy_addr = "221.228.17.172:8181"
url = "http://www.baidu.com"
data = use_proxy(url, proxy_addr)
# use_proxy() returns None (after printing the error) when the request fails,
# and len(None) would raise TypeError, so only report the size on success.
if data is not None:
    print(len(data))

View Code

　　（2）微信爬虫

　　所谓微信爬虫，即自动获取微信相关文章信息的一种爬虫。微信对我们的限制很多，所以我们需要采取一些手段来解决这些限制，主要包括伪装浏览器、使用代理IP等方式。

#http://weixin.sogou.com/
import re
import urllib.request
import time
import urllib.error
import urllib.request
#自定义函数，功能为使用代理服务器爬一个网址
def use_proxy(proxy_addr,url):
    """Download *url* through the HTTP proxy *proxy_addr* with a browser User-Agent.

    Args:
        proxy_addr: Proxy in "host:port" form, applied to the "http" scheme.
        url: Address to download.

    Returns:
        The raw response body as bytes, or None when the request failed.
        Failures are printed rather than raised; the function then sleeps
        (10 s after a URLError, 1 s after any other exception) before
        returning, which throttles the caller's next attempt.

    Side effects:
        Installs the proxy-enabled opener as the global urllib opener.
    """
    try:
        req = urllib.request.Request(url)
        # Pretend to be desktop Firefox; the value must be a well-formed
        # User-Agent string for the disguise to be convincing.
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0')

        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # Make the proxy opener the default for urllib.request.urlopen().
        urllib.request.install_opener(opener)
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(req) as response:
            return response.read()
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # Back off for 10 seconds after a URLError.
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        # Back off for 1 second after any other error.
        time.sleep(1)
    return None

# Search keyword. Percent-encode it exactly once: quoting inside the page
# loop would re-encode the "%" signs of a non-ASCII keyword on every pass.
key = "Python"
key = urllib.request.quote(key)
# Proxy server to use; it may have expired, so replace it with a working one
# (e.g. a local proxy such as "127.0.0.1:8888").
proxy = "221.228.17.172:8181"

# Captures the href of every anchor tag on a result page.
pat1 = '<a href="(.*?)"'
link_re = re.compile(pat1, re.S)

# Number of result pages to crawl.
for i in range(0, 10):
    thispageurl = "http://weixin.sogou.com/weixin?type=2&query=" + key + "&page=" + str(i)
    thispagedata = use_proxy(proxy, thispageurl)
    # use_proxy() returns bytes, or None when the request failed. Decode the
    # bytes so the regex sees the real text rather than the repr of a bytes
    # object (where non-ASCII characters appear as \x.. escapes).
    pagetext = "" if thispagedata is None else thispagedata.decode("utf-8", "ignore")
    print(len(pagetext))
    rs1 = link_re.findall(pagetext)
    if not rs1:
        print("此次（" + str(i) + "页）没成功")
        continue
    for j, thisurl in enumerate(rs1):
        # Links in HTML source have "&" escaped as "&amp;"; undo that.
        thisurl = thisurl.replace("amp;", "")
        file = "F:/python/data/weixin/第" + str(i) + "页第" + str(j) + "篇文章.html"
        thisdata = use_proxy(proxy, thisurl)
        if thisdata is None:
            # The download failed and use_proxy() already printed the cause.
            print("第" + str(i) + "页第" + str(j) + "篇文章失败")
            continue
        try:
            # The context manager closes the file even if write() fails.
            with open(file, "wb") as fh:
                fh.write(thisdata)
            print("第" + str(i) + "页第" + str(j) + "篇文章成功")
        except Exception as e:
            print(e)
            print("第" + str(i) + "页第" + str(j) + "篇文章失败")

View Code

5.多线程爬虫

　　多线程，即程序中的某些程序段并行执行，合理地设置多线程，可以让爬虫效率更高。

　　（1）普通爬虫（爬取糗事百科）

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
import urllib.request
import re
import urllib.error

# Send a desktop Chrome User-Agent with every urllib.request call below.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# Captures the text inside <div class="content"><span>...</span></div>.
# re.S lets "." span newlines because a match may cover several lines.
pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
content_re = re.compile(pat, re.S)

for i in range(1, 2):
    try:
        # Page URLs look like https://www.qiushibaike.com/8hr/page/1/
        url = "https://www.qiushibaike.com/8hr/page/" + str(i)
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(url) as response:
            pagedata = response.read().decode("utf-8", "ignore")
        datalist = content_re.findall(pagedata)
        for j, content in enumerate(datalist):
            print("第" + str(i) + "页第" + str(j) + "个段子的内容是：")
            print(content)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print(e)
        # Report the page only: the inner index j is unbound when the failure
        # happens before the inner loop starts, so using it here raised
        # NameError from inside the handler.
        print("第" + str(i) + "页爬取失败")

View Code

　　 (2)多线程爬虫(爬取糗事百科)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
import urllib.request
import re
import urllib.error
import threading

# Send a desktop Chrome User-Agent with every request made through
# urllib.request.urlopen() from here on.
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
headers = ("User-Agent", user_agent)
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

class One(threading.Thread):
    """Thread that crawls the odd-numbered pages (1, 3, ..., 35) of the "8hr" listing."""

    def __init__(self):
        # Only the base Thread initialisation is needed.
        super().__init__()

    def run(self):
        """Fetch each odd page and print every joke found on it."""
        # Captures the text inside <div class="content"><span>...</span></div>;
        # re.S lets "." span newlines because a match may cover several lines.
        joke_re = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        for page in range(1, 36, 2):
            try:
                # Page URLs look like https://www.qiushibaike.com/8hr/page/1/
                url = "https://www.qiushibaike.com/8hr/page/" + str(page)
                html = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
                for idx, joke in enumerate(joke_re.findall(html)):
                    print("第" + str(page) + "页第" + str(idx) + "个段子的内容是：")
                    print(joke)
            except urllib.error.URLError as err:
                # Print whichever of the two attributes the error carries,
                # status code first, then reason, and move on to the next page.
                for attr in ("code", "reason"):
                    if hasattr(err, attr):
                        print(getattr(err, attr))


class Two(threading.Thread):
    """Thread that crawls the even-numbered pages (0, 2, ..., 34) of the "8hr" listing."""

    def __init__(self):
        # Only the base Thread initialisation is needed.
        super().__init__()

    def run(self):
        """Fetch each even page and print every joke found on it."""
        # Captures the text inside <div class="content"><span>...</span></div>;
        # re.S lets "." span newlines because a match may cover several lines.
        joke_re = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        # NOTE(review): the range starts at page 0 — confirm that the site
        # really serves a page 0, otherwise start at 2.
        for page in range(0, 36, 2):
            try:
                # Page URLs look like https://www.qiushibaike.com/8hr/page/1/
                url = "https://www.qiushibaike.com/8hr/page/" + str(page)
                html = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
                for idx, joke in enumerate(joke_re.findall(html)):
                    print("第" + str(page) + "页第" + str(idx) + "个段子的内容是：")
                    print(joke)
            except urllib.error.URLError as err:
                # Print whichever of the two attributes the error carries,
                # status code first, then reason, and move on to the next page.
                for attr in ("code", "reason"):
                    if hasattr(err, attr):
                        print(getattr(err, attr))



# Create both crawler threads, then start them so the odd and even pages are
# fetched concurrently.
one = One()
two = Two()

one.start()
two.start()

View Code

自主学习Python数据分析与数据挖掘中爬虫入门总结。

参考网上教程：https://www.hellobi.com/

Python数据分析与挖掘

猜你喜欢