5. Simple Crawler Examples


1. Scraping images from the xiachufang.com homepage

# encoding: utf-8

"""
@author: sunxianpeng
@file: 58spider.py
@time: 2019/10/25 19:19
"""
import os
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urlparse
class Main():
    def __init__(self):
        pass

    def reqest_url(self, url):
        # initialize response first, so it is defined even if the request raises
        response = None
        try:
            response = requests.get(url)
        except RequestException as e:
            print("request failed!", e)
        return response

    def get_img_labels(self, imgs):
        img_list = []
        for img in imgs:
            # prefer the lazy-load data-src attribute when present
            if img.has_attr('data-src'):
                img_list.append(img.attrs['data-src'])
            else:
                # otherwise fall back to the plain src attribute
                img_list.append(img.attrs["src"])
        return img_list

    def dir_judge_or_create(self, dir_path):
        # create the directory (including missing parents) if it does not exist yet
        if not os.path.isdir(dir_path):
            os.makedirs(dir_path)

    def save_img(self, img_req, img_path):
        with open(img_path, "wb") as f:
            # write the image in 1024-byte chunks
            for chunk in img_req.iter_content(1024):
                f.write(chunk)


if __name__ == '__main__':
    m = Main()
    url = r"http://www.xiachufang.com/"
    html_content = m.reqest_url(url).text
    soup = BeautifulSoup(html_content)
    imgs = soup.select('img')# 选取所有的img图片标签
    img_list = m.get_img_labels()# 获取包含图片url的属性内容
    img_dir = os.path.join(os.curdir, 
                           "E:\PythonProjects\python_study\python_requests\spider\data\imges")
    m.dir_judge_or_create(img_dir)

    for img_url in img_list:
        o = urlparse(img_url)
        # take the image file name from the URL path, dropping the '@' size suffix
        img_name = o.path[1:].split('@')[0]
        # local path where the image will be stored
        img_path = os.path.join(img_dir, img_name)
        # some image names include a subdirectory; create it to avoid open() errors
        m.dir_judge_or_create(os.path.dirname(img_path))
        # rebuild the image URL without the size suffix
        u = '%s://%s/%s' % (o.scheme, o.netloc, img_name)
        print(u)
        # fetch the image as binary content
        img_req = m.reqest_url(u)
        m.save_img(img_req, img_path)
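
One caveat worth noting: save_img writes the image in 1024-byte chunks, but since reqest_url does not pass stream=True, requests has already buffered the entire body in memory before iter_content runs. For large files a streamed request avoids that. A minimal sketch under that assumption (download_file is a name introduced here for illustration):

import requests

def download_file(url, path, chunk_size=1024):
    # stream=True defers downloading the body until iter_content is called,
    # so only about chunk_size bytes are held in memory at a time
    with requests.get(url, stream=True, timeout=10) as resp:
        resp.raise_for_status()  # fail fast on 4xx/5xx responses
        with open(path, "wb") as f:
            for chunk in resp.iter_content(chunk_size):
                f.write(chunk)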

2. Scraping qianmu.org

1) Single-threaded

# encoding: utf-8

"""
@author: sunxianpeng
@file: qianmu_spider.py
@time: 2019/10/26 13:32
"""
import requests
from requests.exceptions import RequestException
from lxml import etree

class Main():
    def __init__(self):
        pass
    def reqest_url(self, url):
        headers = {'user-agent':
                       'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/67.0.3396.62 Safari/537.36'}
        response = None
        try:
            # pass headers so the request carries a browser user-agent
            response = requests.get(url, headers=headers)
        except RequestException as e:
            print("request failed!", e)
        return response

    def get_selector(self,html_content):
        selector = None
        try:
            selector = etree.HTML(html_content)
        except Exception as e:
            print("get selector is error!", e)
        return selector

    def analyze_html(self, selector):
        data = {}
        keys = []
        values = []
        data["college_name"] = selector.xpath('//div[@id="wikiContent"]/h1/text()')[0]
        # cells may contain line breaks, so join all text nodes under each <p>
        table = selector.xpath('//div[@id="wikiContent"]/div[@class="infobox"]//table')
        if table:
            table = table[0]
            cols_k = table.xpath('.//td[1]')
            cols_v = table.xpath('.//td[2]')
            for j in range(len(cols_k)):
                col_k = cols_k[j]
                col_v = cols_v[j]
                keys.append(''.join(col_k.xpath('./p//text()')))
                values.append(''.join(col_v.xpath('./p//text()')))
            # zip the two lists into pairs and merge them into the data dict
            data.update(zip(keys, values))
        return data

    def process_entrance(self, selector):
        data = {}
        links = selector.xpath('//tbody//tr[@height=19][position()>1]/td/a/@href')
        for i in range(len(links)):
            link = str(links[i])
            if not link.startswith("http://www.qianmu.org"):
                link = "http://www.qianmu.org/%s" % link
            page_selector = self.get_selector(self.reqest_url(link).text)
            try:
                data = self.analyze_html(page_selector)
                print(data)
            except Exception as e:
                # inspect e here to handle non-standard table layouts;
                # for now just print the offending link and skip it
                print(link)
                continue
        return data

if __name__ == '__main__':
    m = Main()
    url = "http://www.qianmu.org/ranking/1528.htm"
    req = m.reqest_url(url)
    selector = m.get_selector(req.text)
    data = m.process_entrance(selector)
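
The XPath expressions in analyze_html are easiest to verify in isolation before running them against live pages. A minimal sketch, run on a hand-written fragment that mimics the expected infobox layout (the HTML below is invented for illustration, not copied from qianmu.org):

from lxml import etree

# hand-written fragment mimicking the structure analyze_html expects
html = """
<div id="wikiContent">
  <h1>Example University</h1>
  <div class="infobox">
    <table>
      <tr><td><p>Country</p></td><td><p>United States</p></td></tr>
      <tr><td><p>Founded</p></td><td><p>1865</p></td></tr>
    </table>
  </div>
</div>
"""

selector = etree.HTML(html)
print(selector.xpath('//div[@id="wikiContent"]/h1/text()')[0])  # Example University
keys = [''.join(td.xpath('./p//text()'))
        for td in selector.xpath('//div[@class="infobox"]//table//td[1]')]
values = [''.join(td.xpath('./p//text()'))
          for td in selector.xpath('//div[@class="infobox"]//table//td[2]')]
print(dict(zip(keys, values)))  # {'Country': 'United States', 'Founded': '1865'}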

2) Multi-threaded

# encoding: utf-8

"""
@author: sunxianpeng
@file: qianmu_spider.py
@time: 2019/10/26 13:32
"""
import requests
from requests.exceptions import RequestException
from lxml import etree
import threading
from queue import Queue
import time

class Main():
    def __init__(self):
        pass
    def reqest_url(self, url):
        headers = {'user-agent':
                       'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/67.0.3396.62 Safari/537.36'}
        response = None
        try:
            # pass headers so the request carries a browser user-agent
            response = requests.get(url, headers=headers)
        except RequestException as e:
            print("request failed!", e)
        return response

    def get_selector(self,html_content):
        selector = None
        try:
            selector = etree.HTML(html_content)
        except Exception as e:
            print("get selector is error!", e)
        return selector

    def analyze_html(self, selector):
        data = {}
        keys = []
        values = []
        data["college_name"] = selector.xpath('//div[@id="wikiContent"]/h1/text()')[0]
        # cells may contain line breaks, so join all text nodes under each <p>
        table = selector.xpath('//div[@id="wikiContent"]/div[@class="infobox"]//table')
        if table:
            table = table[0]
            cols_k = table.xpath('.//td[1]')
            cols_v = table.xpath('.//td[2]')
            for j in range(len(cols_k)):
                col_k = cols_k[j]
                col_v = cols_v[j]
                keys.append(''.join(col_k.xpath('./p//text()')))
                values.append(''.join(col_v.xpath('./p//text()')))
            # zip the two lists into pairs and merge them into the data dict
            data.update(zip(keys, values))
        return data

    def download(self, link_queue):
        """Worker loop: consume links from the queue until a None sentinel arrives."""
        while True:
            # block until a link can be taken from the queue
            link = link_queue.get()
            # a None sentinel tells this worker to exit
            if link is None:
                break
            if not link.startswith("http://www.qianmu.org"):
                link = "http://www.qianmu.org/%s" % link
            try:
                selector = self.get_selector(self.reqest_url(link).text)
                data = self.analyze_html(selector)
                print(data)
            except Exception as e:
                # inspect e here to handle non-standard tables; skip the link for now
                print(link)
            finally:
                # mark the item done even on failure, or link_queue.join() would hang
                link_queue.task_done()
            print('remaining queue: %s' % link_queue.qsize())


if __name__ == '__main__':
    start_time = time.time()
    m = Main()
    url = "http://www.qianmu.org/ranking/1528.htm"
    link_queue = Queue()
    req = m.reqest_url(url)
    selector = m.get_selector(req.text)
    links = selector.xpath('//tbody//tr[@height=19][position()>1]/td/a/@href')
    for i in range(len(links)):
        link = str(links[i])
        link_queue.put(link)
    # start the worker threads and keep the thread objects in a list
    threads = []
    thread_num = 10
    for i in range(thread_num):
        # pass the function and its argument separately; writing
        # target=m.download(link_queue) would run the download in the main thread
        t = threading.Thread(target=m.download, args=(link_queue,))
        t.start()
        threads.append(t)
    # block until every queued link has been processed (workers stay alive)
    link_queue.join()
    # send one None sentinel per worker to tell the threads to exit
    for i in range(thread_num):
        link_queue.put(None)
    # block the main thread until every worker has exited
    for t in threads:
        t.join()

    used_time = time.time() - start_time
    print("download finished !!, used time : %s" % used_time)

3) Integrating Redis for a simple distributed crawler

# encoding: utf-8

"""
@author: sunxianpeng
@file: qianmu_spider.py
@time: 2019/10/26 13:32
"""
import requests
from requests.exceptions import RequestException
from lxml import etree
import threading
from queue import Queue
import time
from redis import Redis
import signal

class Main():
    def __init__(self):
        pass
    def reqest_url(self, url):
        headers = {'user-agent':
                       'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/67.0.3396.62 Safari/537.36'}
        response = None
        try:
            # pass headers so the request carries a browser user-agent
            response = requests.get(url, headers=headers)
        except RequestException as e:
            print("request failed!", e)
        return response

    def get_selector(self,html_content):
        selector = None
        try:
            selector = etree.HTML(html_content)
        except Exception as e:
            print("get selector is error!", e)
        return selector

    def analyze_html(self, selector):
        data = {}
        keys = []
        values = []
        data["college_name"] = selector.xpath('//div[@id="wikiContent"]/h1/text()')[0]
        # cells may contain line breaks, so join all text nodes under each <p>
        table = selector.xpath('//div[@id="wikiContent"]/div[@class="infobox"]//table')
        if table:
            table = table[0]
            cols_k = table.xpath('.//td[1]')
            cols_v = table.xpath('.//td[2]')
            for j in range(len(cols_k)):
                col_k = cols_k[j]
                col_v = cols_v[j]
                keys.append(''.join(col_k.xpath('./p//text()')))
                values.append(''.join(col_v.xpath('./p//text()')))
            # zip the two lists into pairs and merge them into the data dict
            data.update(zip(keys, values))
        return data

    def download(self, r, i):
        """Worker loop: pop links from the shared Redis queue until thread_on is False."""
        while thread_on:
            # lpop returns one queued link, or None when the queue is empty
            link = r.lpop("qianmu.queue")
            if link:
                link = link.decode('utf-8')  # redis-py returns bytes by default
                if not link.startswith("http://www.qianmu.org"):
                    link = "http://www.qianmu.org/%s" % link
                try:
                    selector = self.get_selector(self.reqest_url(link).text)
                    data = self.analyze_html(selector)
                    print(data)
                except Exception as e:
                    # inspect e here to handle non-standard tables; skip the link for now
                    print(link)
                    continue
                print('remaining queue: %s' % r.llen("qianmu.queue"))
            time.sleep(0.2)
        print("Thread-%s exit now" % i)


    def sigint_handler(self, signum, frame):
        # signal handlers are called with (signum, frame); flip the flag so workers exit
        print("received Ctrl+C, waiting for threads to exit gracefully!!")
        global thread_on
        thread_on = False

if __name__ == '__main__':
    start_time = time.time()
    m = Main()
    r = Redis()

    url = "http://www.qianmu.org/ranking/1528.htm"

    req = m.reqest_url(url)
    selector = m.get_selector(req.text)
    links = selector.xpath('//tbody//tr[@height=19][position()>1]/td/a/@href')
    for i in range(len(links)):
        link = str(links[i])
        # sadd returns 1 only for unseen links, so each link is queued exactly once
        if r.sadd("qianmu.ifexists", link):
            r.rpush("qianmu.queue", link)
    # start the worker threads and keep the thread objects in a list
    threads = []
    thread_num = 10
    thread_on = True  # workers keep running while this flag is True
    for i in range(thread_num):
        # pass the function and its arguments separately; writing
        # target=m.download(r) would run the download in the main thread instead
        t = threading.Thread(target=m.download, args=(r, i + 1))
        t.start()
        threads.append(t)
    # register the Ctrl+C handler, then block until every worker has exited
    signal.signal(signal.SIGINT, m.sigint_handler)
    for t in threads:
        t.join()

    used_time = time.time() - start_time
    print("download finished !!, used time : %s" % used_time)
