网页爬虫设计

项目驱动，需要从网站上爬取文章，并上传至服务器，实现模拟用户发帖。

框架采用Python3，配合爬虫框架Scrapy实现，目前只能抓取静态页，JS+Ajax动态加载的网页见下一篇博客

GitHub地址：https://github.com/JohonseZhang/Scrapy-Spider-based-on-Python3
求Star~

另外，爬取今日头条、淘宝、京东等动态加载的网站时，需要配合selenium和phantomjs：
[GitHub地址]：https://github.com/JohonseZhang/python3-scrapy-spider-phantomjs-selenium
求Star~求Star~求Star~

项目结构

代码结构图：
这里写图片描述

创建项目

进入指定文件夹，右击空白处>在此处打开命令行窗口
创建项目

scrapy startproject DgSpider

主要代码文件说明

爬虫主类：UrlSpider.py、ContentSpider.py
项目包含2个爬虫主类，分别用于爬取文章列表页所有文章的URL、文章详情页具体内容
内容处理类：pipelines.py
处理内容
传输字段类：items.py
暂存爬取的数据
设置文件：settings.py
用于主要的参数配置
数据库操作：mysqlUtils.py
连接并操作数据库

代码实现

UrlSpider.py

# -*- coding: utf-8 -*-

import scrapy
from DgSpider.items import DgspiderUrlItem
from scrapy.selector import Selector
from DgSpider import urlSettings


class DgUrlSpider(scrapy.Spider):
    """Collect article URLs from the paginated article-list pages.

    The list-page URLs are built once, when the class body is executed, from
    the constants in ``urlSettings``.  Every list page is queried with
    ``urlSettings.POST_URL_XPATH`` and the de-duplicated article URLs found
    on it are emitted as one ``DgspiderUrlItem``.
    """

    print('Spider DgUrlSpider Staring...')

    # The spider name must be a static literal (it is not read from settings).
    name = 'DgUrlSpider'

    # Restrict crawling to the configured domain.
    allowed_domains = [urlSettings.DOMAIN]

    # The first list page usually does not follow the numbered pattern, so it
    # is queued explicitly; the remaining pages come from the prefix/suffix
    # rule.
    url_list = [urlSettings.START_LIST_URL]
    loop = urlSettings.LIST_URL_RULER_LOOP
    for i in range(1, loop):
        url = urlSettings.LIST_URL_RULER_PREFIX + str(i) + urlSettings.LIST_URL_RULER_SUFFIX
        # Avoid queuing the same list page twice, e.g. when START_LIST_URL
        # is itself page 1 of the numbered pattern.
        if url not in url_list:
            url_list.append(url)
    start_urls = url_list

    def parse(self, response):
        """Extract the article URLs of one list page.

        Args:
            response: the Scrapy response of a list page.

        Yields:
            A single ``DgspiderUrlItem`` whose ``url`` field is a list of
            unique, absolute article URLs.
        """
        hrefs = response.xpath(urlSettings.POST_URL_XPATH).extract()

        # Resolve every href against the page URL so that absolute,
        # protocol-relative and relative links all end up as absolute URLs
        # (the scheme of absolute links, including https, is preserved).
        # The set removes duplicates found on the same page.
        unique_urls = {response.urljoin(href.strip()) for href in hrefs}

        item_url = DgspiderUrlItem()
        item_url['url'] = list(unique_urls)

        yield item_url

ContentSpider.py

# -*- coding: utf-8 -*-

import scrapy
from DgSpider.mysqlUtils import dbhandle_geturl
from DgSpider.items import DgspiderPostItem
from scrapy.selector import Selector
from scrapy.http import Request
from DgSpider import contentSettings
from DgSpider import urlSettings
from DgSpider.mysqlUtils import dbhandle_update_status


class DgContentSpider(scrapy.Spider):
    """Crawl one pending article, including its continuation pages.

    The article to crawl is fetched from the database when the spider is
    instantiated.  Doing this in ``__init__`` rather than in the class body
    keeps a plain import of this module free of database side effects, so
    loading the module for a different spider no longer consumes a pending
    URL.
    """

    print('Spider DgContentSpider Staring...')

    # The spider name must be a static literal (it is not read from settings).
    name = 'DgContentSpider'

    # Article URLs are expected to end with this suffix; continuation pages
    # are then named "<base>_<n>.html".
    _PAGE_SUFFIX = '.html'

    def __init__(self, *args, **kwargs):
        """Load the next pending URL and prepare the pagination queue."""
        super().__init__(*args, **kwargs)

        result = dbhandle_geturl(urlSettings.GROUP_ID)
        self.url = result[0]
        self.spider_name = result[1]
        self.site = result[2]
        self.gid = result[3]
        self.module = result[4]

        # Limit crawling to the site stored with the URL.
        self.allowed_domains = [self.site]
        self.start_urls = [self.url]

        # Candidate continuation pages url_2.html ... url_6.html, stored in
        # descending order so that pop() hands out page 2 first.
        self.start_urls_tmp = []
        if self.url.endswith(self._PAGE_SUFFIX):
            base = self.url[:-len(self._PAGE_SUFFIX)]
            self.start_urls_tmp = [
                base + "_" + str(i) + self._PAGE_SUFFIX for i in range(6, 1, -1)
            ]

        # Mark the URL as handled up front, whether or not the crawl later
        # succeeds, so a failing page is not picked up again and again.
        dbhandle_update_status(self.url, 1)

    def parse(self, response):
        """Extract title and body of one page and queue the next page.

        Args:
            response: the Scrapy response of an article page.

        Yields:
            A ``DgspiderPostItem`` for the page, followed by a ``Request``
            for the next candidate continuation page while any are left.
        """
        item = DgspiderPostItem()
        item['url'] = self.url

        # The title may be wrapped in nested tags such as
        # <div><h1><span>...</span></h1></div>; string(.) flattens them.
        title_nodes = response.xpath(contentSettings.POST_TITLE_XPATH)
        item['title'] = title_nodes.xpath('string(.)').extract()

        item['text'] = response.xpath(contentSettings.POST_CONTENT_XPATH).extract()

        yield item

        if self.start_urls_tmp:
            next_url = self.start_urls_tmp.pop()
            yield Request(next_url, callback=self.parse)

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
# If you have many piplelines, all should be init here
# and use IF to judge them
#
# DOUGUO Spider pipelines
# @author zhangjianfei
# @date 2017/04/13

import re
import urllib.request
from DgSpider import urlSettings
from DgSpider import contentSettings
from DgSpider.mysqlUtils import dbhandle_insert_content
from DgSpider.uploadUtils import uploadImage
from DgSpider.mysqlUtils import dbhandle_online
from DgSpider.mysqlUtils import dbhandle_update_status
from bs4 import BeautifulSoup
from DgSpider.PostHandle import post_handel
from DgSpider.commonUtils import get_random_user
from DgSpider.commonUtils import get_linkmd5id


class DgPipeline(object):
    """Scrapy item pipeline shared by the URL spider and the content spider.

    * Items of the content spider (``contentSettings.SPIDER_NAME``) are
      accumulated page by page in the class attributes below and written to
      the database in ``close_spider``.
    * Items of the URL spider (``urlSettings.SPIDER_NAME``) carry a list of
      article URLs; URLs that are not stored yet are inserted into
      ``dg_spider.dg_spider_post``.
    """

    # Replies used to build a post (not used inside this class).
    cs = []

    # Post title, taken from the first page of a paginated article.
    title = ''

    # Post text accumulated over all pages.
    text = ''

    # URL currently being crawled.
    url = ''

    # Randomly chosen posting user id.
    user_id = ''

    # Becomes 1 as soon as one image has been handled.
    has_img = 0

    # Number of items processed so far; 1 means "first page".
    get_title_flag = 0

    # Matches exactly one <img ...> tag; [^>]* keeps the match from running
    # on to a later tag on the same line.
    _IMG_TAG_RE = re.compile(r'<img[^>]*>', re.I)

    # Captures the quoted value of the src attribute (group 2).  The
    # look-behind rejects attributes that merely end in "src" (data-src).
    _IMG_SRC_RE = re.compile(r'''(?<![\w-])src\s*=\s*(["'])(.*?)\1''', re.I)

    def __init__(self):
        DgPipeline.user_id = get_random_user(contentSettings.CREATE_POST_USER)

    @staticmethod
    def _extract_img_src(img_tag):
        """Return the src value of an <img> tag, or None if it has none."""
        match = DgPipeline._IMG_SRC_RE.search(img_tag)
        if match is None:
            return None
        return match.group(2).strip() or None

    @staticmethod
    def _normalize_img_url(src, page_url):
        """Turn an image src into an absolute URL without losing its scheme."""
        from urllib.parse import urljoin

        # Protocol-relative URLs keep the historical default of plain http.
        if src.startswith('//'):
            return 'http:' + src
        # Absolute URLs (http or https) pass through unchanged; relative
        # ones are resolved against the page they were found on.
        return urljoin(page_url, src)

    @staticmethod
    def _strip_markup(markup, suffix=''):
        """Return the text of an HTML fragment, each string followed by suffix.

        ``stripped_strings`` is used instead of ``prettify()`` because the
        latter puts every tag on its own line and so introduces many extra
        spaces and line breaks.
        """
        soup = BeautifulSoup(markup, "lxml")
        return ''.join(string + suffix for string in soup.stripped_strings)

    def process_item(self, item, spider):
        """Dispatch an item to the handler that matches the running spider.

        Args:
            item: the scraped item (``DgspiderPostItem`` or ``DgspiderUrlItem``).
            spider: the spider that produced the item.

        Returns:
            The unmodified item.
        """
        self.get_title_flag += 1

        if spider.name == contentSettings.SPIDER_NAME:
            self._process_content_item(item)
        elif spider.name == urlSettings.SPIDER_NAME:
            self._store_urls(item['url'])

        return item

    def _process_content_item(self, item):
        """Accumulate title, text and uploaded images of one article page."""
        DgPipeline.url = item['url']

        # For a paginated article only the title of the first page is used.
        if self.get_title_flag == 1:
            raw_title = item['title'][0] if item['title'] else ''
            title = self._strip_markup(raw_title)
            # Replace quote characters that could break SQL statements.
            DgPipeline.title = title.replace("'", "”").replace('"', '“')

        body = item['text'][0] if item['text'] else ''

        for img_tag in self._IMG_TAG_RE.findall(body):
            src = self._extract_img_src(img_tag)
            if src is None:
                # No usable src attribute: nothing to download for this tag.
                continue
            DgPipeline.has_img = 1

            img_url = self._normalize_img_url(src, DgPipeline.url)
            file_name = img_url.split('/')[-1]

            # Download the image locally, then upload it to the server.
            file_path = contentSettings.IMAGES_STORE + file_name
            urllib.request.urlretrieve(img_url, file_path)
            upload_result = uploadImage(file_path, 'image/jpeg', DgPipeline.user_id)['result']

            # Server-side image path, width and height, joined by ';'.
            img_upload_flag = ';'.join(
                str(upload_result[key]) for key in ('image_url', 'w', 'h')
            )

            # Replace the tag by a text marker carrying the upload result.
            body = body.replace(img_tag, '[dgimg]' + img_upload_flag + '[/dgimg]')

        page_text = self._strip_markup(body, '\n')

        # Use typographic quotes instead of '"' to avoid MySQL syntax errors.
        DgPipeline.text = self.text + page_text.replace('"', '“')

    def _store_urls(self, urls):
        """Insert every URL that is not stored yet as a pending row."""
        sql_search = 'select md5_url from dg_spider.dg_spider_post where md5_url=%s'
        sql_insert = ('insert into dg_spider.dg_spider_post'
                      '(md5_url, url, spider_name, site, gid, module, status) '
                      'values(%s, %s, %s, %s, %s, %s, %s)')

        db_object = dbhandle_online()
        try:
            cursor = db_object.cursor()
            for url in urls:
                linkmd5id = get_linkmd5id(url)
                row_values = (
                    linkmd5id,
                    url,
                    contentSettings.SPIDER_NAME,
                    urlSettings.DOMAIN,
                    urlSettings.GROUP_ID,
                    urlSettings.MODULE,
                    '0',
                )
                try:
                    # Values are passed as query parameters so that scraped
                    # URLs cannot alter the SQL statement.
                    cursor.execute(sql_search, (linkmd5id,))
                    found = cursor.fetchone()
                    # "not ...strip()" works for both bytes and str values.
                    if found is None or not found[0].strip():
                        cursor.execute(sql_insert, row_values)
                        db_object.commit()
                except Exception as e:
                    print(">>> catch exception !")
                    print(e)
                    db_object.rollback()
        finally:
            db_object.close()

    # Called when the spider is opened.
    def open_spider(self, spider):
        pass

    # Called when the spider is closed.
    def close_spider(self, spider):
        """Persist the accumulated article and publish it if it has images."""
        if spider.name != contentSettings.SPIDER_NAME:
            return

        url = DgPipeline.url
        title = DgPipeline.title
        content = DgPipeline.text
        user_id = DgPipeline.user_id
        dbhandle_insert_content(url, title, content, user_id, DgPipeline.has_img)

        # The status column was already set to 1 when the spider started,
        # so it is not updated again here.

        # Only articles that contain at least one image are published.
        if DgPipeline.has_img != 1:
            spider.logger.info('has_img=0, changing status and ready to exit...')
        elif title.strip() != '' and content.strip() != '':
            spider.logger.info('has_img=1,title and content is not null! Uploading post into db...')
            post_handel(url)
        else:
            spider.logger.info('has_img=1,but title or content is null! ready to exit...')

items.py

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# douguo Spider Item
# @author zhangjianfei
# @date 2017/04/07
import scrapy

class DgspiderUrlItem(scrapy.Item):
    """Item emitted by the URL spider."""
    # List of article URLs collected from one list page.
    url = scrapy.Field()

class DgspiderPostItem(scrapy.Item):
    """Item emitted by the content spider for one crawled page."""
    # URL of the article being crawled.
    url = scrapy.Field()
    # Title strings extracted from the page (a list).
    title = scrapy.Field()
    # Body markup extracted from the page (a list).
    text = scrapy.Field()

settings.py
这个文件只需要更改或加上特定的配置项

# Name of the Scrapy project/bot.
BOT_NAME = 'DgSpider'

# Packages in which Scrapy looks for spiders, and where new ones are created.
SPIDER_MODULES = ['DgSpider.spiders']
NEWSPIDER_MODULE = 'DgSpider.spiders'

# Register the item pipelines; the number is the pipeline's order value.
ITEM_PIPELINES = {
    'DgSpider.pipelines.DgPipeline': 1
}

mysqlUtils.py

import pymysql
import pymysql.cursors
import os


def dbhandle_online():
    """Open and return a new PyMySQL connection with ``use_unicode=False``.

    The caller owns the returned connection and is responsible for closing it.
    """
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration.
    connect_kwargs = {
        'host': '192.168.1.235',
        'user': 'root',
        'passwd': 'douguo2015',
        'charset': 'utf8',
        'use_unicode': False,
    }
    return pymysql.connect(**connect_kwargs)


def dbhandle_local():
    """Open and return a new PyMySQL connection with ``use_unicode=True``.

    The caller owns the returned connection and is responsible for closing it.
    """
    # NOTE(review): host and credentials are hard-coded here and identical to
    # dbhandle_online(); only use_unicode differs.
    connect_kwargs = {
        'host': '192.168.1.235',
        'user': 'root',
        'passwd': 'douguo2015',
        'charset': 'utf8',
        'use_unicode': True,
    }
    return pymysql.connect(**connect_kwargs)


def dbhandle_geturl(gid):
    """Fetch one not-yet-crawled article row (status=0) of a group.

    Args:
        gid: group id used to filter ``dg_spider.dg_spider_post``.

    Returns:
        A tuple ``(url, spider_name, site, gid, module)`` of decoded strings.

    Raises:
        Exception: any database error is printed, rolled back and re-raised.

    Note:
        When no pending row exists the whole process is terminated with
        ``os._exit(0)``.
    """
    sql = ('select url,spider_name,site,gid,module from dg_spider.dg_spider_post '
           'where status=0 and gid=%s limit 1')

    # Same connection settings as every other helper in this module.
    conn = dbhandle_online()
    try:
        cursor = conn.cursor()
        # gid is passed as a query parameter instead of being formatted
        # into the SQL text.
        cursor.execute(sql, (gid,))
        result = cursor.fetchone()
        conn.commit()
    except Exception as e:
        print("***** exception")
        print(e)
        conn.rollback()
        # Without a row there is nothing sensible to return.
        raise
    finally:
        conn.close()

    if result is None:
        os._exit(0)

    url, spider_name, site, row_gid, module = result
    return url.decode(), spider_name.decode(), site.decode(), row_gid.decode(), module.decode()


def dbhandle_insert_content(url, title, content, user_id, has_img):
    """Store the crawled title and content on the row identified by url.

    Args:
        url: URL of the crawled article (row key).
        title: article title.
        content: article text.
        user_id: id of the user the post is attributed to.
        has_img: 1 if the article contains images, otherwise 0.

    Returns:
        The result of ``fetchone()`` after the update, or None when the
        update failed (the error is printed and rolled back).

    Note:
        If title or content is blank the article is discarded: its status is
        set to 1 and the process is terminated with ``os._exit(0)`` so that
        the next run continues with a new URL.
    """
    conn = dbhandle_online()
    result = None
    try:
        cur = conn.cursor()

        if content.strip() == '' or title.strip() == '':
            try:
                cur.execute(
                    'update dg_spider.dg_spider_post set status=%s where url=%s',
                    ('1', url),
                )
                conn.commit()
            except Exception as e:
                print(e)
                conn.rollback()
            # os._exit() skips the finally clause, so close explicitly.
            conn.close()
            os._exit(0)

        # Scraped text is passed as query parameters so that quotes in the
        # title or content cannot break or alter the statement.
        sql = ('update dg_spider.dg_spider_post '
               'set title=%s,content=%s,user_id=%s,has_img=%s where url=%s')
        try:
            cur.execute(sql, (title, content, user_id, has_img, url))
            result = cur.fetchone()
            conn.commit()
        except Exception as e:
            print(e)
            conn.rollback()
    finally:
        conn.close()
    return result


def dbhandle_update_status(url, status):
    """Set the status column of the row identified by url.

    Args:
        url: URL of the article (row key).
        status: new status value.

    Returns:
        The result of ``fetchone()`` after the update, or None when the
        update failed (the error is printed and rolled back).
    """
    conn = dbhandle_online()
    result = None
    try:
        cur = conn.cursor()
        # Values are passed as query parameters rather than formatted into
        # the SQL text.
        cur.execute(
            'update dg_spider.dg_spider_post set status=%s where url=%s',
            (status, url),
        )
        result = cur.fetchone()
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()
    finally:
        conn.close()
    return result


def dbhandle_get_content(url):
    """Fetch the stored post data of an already crawled URL (status=1).

    Args:
        url: URL of the article (row key).

    Returns:
        A tuple ``(title, content, user_id, gid)`` of decoded strings.

    Raises:
        Exception: any database error is printed, rolled back and re-raised.

    Note:
        When no matching row exists the whole process is terminated with
        ``os._exit(1)``.
    """
    sql = ('select title,content,user_id,gid from dg_spider.dg_spider_post '
           'where status=1 and url=%s limit 1')

    conn = dbhandle_online()
    try:
        cursor = conn.cursor()
        # url is passed as a query parameter instead of being formatted
        # into the SQL text.
        cursor.execute(sql, (url,))
        result = cursor.fetchone()
        conn.commit()
    except Exception as e:
        print("***** exception")
        print(e)
        conn.rollback()
        raise
    finally:
        conn.close()

    if result is None:
        os._exit(1)

    title, content, user_id, gid = result
    return title.decode(), content.decode(), user_id.decode(), gid.decode()


# 获取爬虫初始化参数
def dbhandle_get_spider_param(url):
    """Fetch the stored fields of a not-yet-crawled URL (status=0).

    Args:
        url: URL of the article (row key).

    Returns:
        A tuple ``(title, content, user_id, gid)`` of decoded strings.

    Raises:
        TypeError: if no matching row exists (the fetched result is None).
        Exception: any database error is printed, rolled back and re-raised.
    """
    sql = ('select title,content,user_id,gid from dg_spider.dg_spider_post '
           'where status=0 and url=%s limit 1')

    conn = dbhandle_online()
    try:
        cursor = conn.cursor()
        # url is passed as a query parameter instead of being formatted
        # into the SQL text.
        cursor.execute(sql, (url,))
        result = cursor.fetchone()
        conn.commit()
    except Exception as e:
        print("***** exception")
        print(e)
        conn.rollback()
        raise
    finally:
        conn.close()

    title = result[0]
    content = result[1]
    user_id = result[2]
    gid = result[3]
    return title.decode(), content.decode(), user_id.decode(), gid.decode()

一些特别的常量及参数，也是用py文件加入
urlSettings.py:

# Domain to crawl.
DOMAIN = 'eastlady.cn'

# Name of the URL spider; must not be changed.
""" URL爬虫模块名，不可变 """
SPIDER_NAME = 'DgUrlSpider'

# Group id stored with every collected URL.
GROUP_ID = '33'

# Module value stored with every collected URL.
MODULE = '999'

# First article-list page to crawl.
START_LIST_URL = 'http://www.eastlady.cn/emotion/pxgx/1.html'

# Rule for building the remaining list-page URLs: PREFIX + page number +
# SUFFIX, with LOOP as the exclusive upper bound of the page number.
LIST_URL_RULER_PREFIX = 'http://www.eastlady.cn/emotion/pxgx/'
LIST_URL_RULER_SUFFIX = '.html'
LIST_URL_RULER_LOOP = 30

# XPath that selects the article URLs on a list page.
POST_URL_XPATH = '//div[@class="article_list"]/ul/li/span[1]/a[last()]/@href'

contentSettings.py:

# -*- coding: utf-8 -*-

# Content-spider settings for the DgSpider project

# Local directory in which downloaded images are stored; the image file name
# is appended directly, so the trailing separator is required.
IMAGES_STORE = 'D:\\pics\\jfss\\'

# Domain to crawl.
DOMAIN = 'nrsfh.com'

# Scheme prefix for image URLs.
DOMAIN_HTTP = "http:"

# Comma-separated user ids from which the posting user is picked at random.
CREATE_POST_USER = '37619,18441390'

# Name of the content spider.
SPIDER_NAME = 'DgContentSpider'

# XPath rules that select the title and the body of an article page.
POST_TITLE_XPATH = '//div[@class="title"]'
POST_CONTENT_XPATH = '//div[@class="bodycss"]'

启动爬虫

进入爬虫代码所在的文件夹，右击：在此打开命令行窗口，先执行：

scrapy crawl DgUrlSpider

进行爬取所有的URL，并入库
再执行：

scrapy crawl DgContentSpider

从数据库中读取URL，抓取网页内容，入库

当然，也可以写一个Windows批处理脚本，持续不断地执行scrapy crawl DgContentSpider：

@rem Run the content spider repeatedly, up to 7000 times.
@echo DOUGUO window Spider
@rem /d also switches the drive when the script is started from another drive.
cd /d D:\Scrapy\DgSpider
for /l %%i in (1,1,7000) do scrapy crawl DgContentSpider
:end
@echo SUCCESS! PRESS ANAY KEY TO EXIT! 
@Pause>nul

当然，这种方式比较笨拙，最好还是启用cmdline，加入多线程，这里不说明

处理完上面的所有步骤，就能成功地抓取到网页数据：
这里写图片描述

Python3+Scrapy实现网页爬虫