Scrapy + Selenium: Advanced Usage

Part 1: Scrapy

Goal: scrape the information for each piece of music on a target page.

Open a command window in the directory where you want the project to live and create a new Scrapy project.
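For example, assuming the project is named testwangyi (the package name used by the imports in the steps below), the command is:

scrapy startproject testwangyi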

Step 1: Define the item class you need in items.py

import scrapy
from scrapy import Field

class TestwangyiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()
    number = Field()
    music_name = Field()
    img_src = Field()
    music_src = Field()
    music_singer = Field()

Step 2: Create a new file under spiders/ and write the spider

from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from testwangyi.items import TestwangyiItem
import re

class music(CrawlSpider):
    name = 'wangyi'
    start_urls = ['http://xyq.163.com/download/down_music.html']

    def parse(self, response):   # parse() is called by default with the response for each URL in start_urls
        # print(response.text)
        item = TestwangyiItem()    # the item class defined in items.py
        reg = re.compile('.*下载.*')
        selector = Selector(response)
        # get all <a> elements
        # all_a = selector.xpath("//a[@download]").re(reg)
        # all_a = selector.xpath("//a[@download]").extract()  # list of strings
        all_a = selector.css('td.tTitle ::text').extract()
        # all rows under the table
        # all_a = selector.xpath("//table//tr").extract()
        for i in all_a:
            item['title'] = i
            yield item     # hand the item to the item pipeline
            # num = i.xpath('//td').extract()
            # print(num)

Step 3: Configure the user agent, output feed, and middleware in settings.py

# User-Agent header
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3)'\
' AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'

FEED_URI = u'file:///D:/pycharm2017.3.2/work/scrapy 0608/doubanTU/douban.csv'
# location and name of the exported file
FEED_FORMAT = 'csv'
# format of the exported file

# Obey robots.txt rules (whether to respect the site's robots.txt)
ROBOTSTXT_OBEY = True

# DOWNLOAD_DELAY = 3    # delay between requests, in seconds

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
   'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'testwangyi.middlewares.TestwangyiSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'testwangyi.middlewares.TestwangyiDownloaderMiddleware': 543,
}

# Configure item pipelines (enables the pipeline that saves the scraped data)
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'testwangyi.pipelines.TestwangyiPipeline': 300,
}

Step 4: Set the proxy and request headers in middlewares.py

In the second class (the downloader middleware), add the proxy IP and the request headers inside the process_response method.

from scrapy import signals
import random

class TestwangyiSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TestwangyiDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):   # set the User-Agent and proxy here
        # Called with the response returned from the downloader.

        # fill these lists with your own User-Agent strings and proxy addresses ('http://ip:port')
        user_agent_list = []
        ipdaili = []
        # set the request header to a random User-Agent from the pool
        if user_agent_list:
            request.headers.setdefault('User-Agent', random.choice(user_agent_list))
        # route the request through one randomly chosen proxy address
        if ipdaili:
            request.meta['proxy'] = random.choice(ipdaili)

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

Step 5: Store the scraped data in pipelines.py

import openpyxl

class TestwangyiPipeline(object):
    wb = openpyxl.Workbook()
    ws = wb.active    # get the active worksheet
    ws.append(['标题'])   # header row
    def process_item(self, item, spider):
        # wrap the values in a list so they can be appended as a row
        line = [item["title"]]
        self.ws.append(line)
        self.wb.save("music.xlsx")
        # print(item)
        return item
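Saving the workbook on every item works, but it rewrites the whole file each time. A common refinement, sketched below, is to build the sheet as items arrive and save once in the close_spider hook:

import openpyxl

class TestwangyiPipeline(object):
    def open_spider(self, spider):
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.ws.append(['标题'])   # header row

    def process_item(self, item, spider):
        self.ws.append([item["title"]])
        return item

    def close_spider(self, spider):
        # write the workbook once, after the last item has been processed
        self.wb.save("music.xlsx")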

Step 6: Create a main entry point to run the spider

from scrapy import cmdline
cmdline.execute('scrapy crawl wangyi'.split())
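Running this file from inside the project starts the crawl; it is equivalent to running scrapy crawl wangyi from the project root in a terminal.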

Part 2: Selenium

Installation

Choose a ChromeDriver version that matches your Chrome browser at http://chromedriver.storage.googleapis.com/index.html, download it, and copy the executable into the browser's installation directory. Then install the Python package:
pip install selenium
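If you would rather not copy the driver next to the browser, Selenium 3 also accepts an explicit driver path; the path below is only a placeholder:

from selenium import webdriver
# replace the path with wherever you saved chromedriver.exe
driver = webdriver.Chrome(executable_path=r'D:\chromedriver.exe')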

1. Open a web page

from selenium import webdriver
url = 'https://www.baidu.com'
driver = webdriver.Chrome()
driver.get(url)

2. Open a page and extract information

driver = webdriver.Chrome()
driver.find_elements_by_class_name()   # locate elements by class name
driver.find_elements_by_tag_name()     # locate elements by tag name
driver.find_elements_by_id()           # locate elements by id
driver.switch_to.frame(id_or_name)     # switch into an iframe by id or name (see the sketch below)
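Locators only see the current frame, so you must switch into an iframe before locating elements inside it. A minimal sketch, assuming a frame named 'login_frame' (the name is hypothetical):

driver.switch_to.frame('login_frame')    # enter the iframe by id or name
# ... locate and interact with elements inside the frame here ...
driver.switch_to.default_content()       # switch back to the main document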

from selenium import webdriver

url = "https://movie.douban.com/chart?qq-pf-to=pcqq.group"
driver = webdriver.Chrome()
# driver.maximize_window()
driver.get(url)

tables = driver.find_elements_by_tag_name("table")
tables.pop(0)   # drop the first table, which is not a movie entry
for v in tables:
    pl2 = v.find_elements_by_class_name('pl2')[0]
    name = pl2.find_elements_by_tag_name("a")[0].text       # movie title (from the <a> link)
    info = pl2.find_elements_by_class_name('pl')[0].text    # release date / cast line
    num = pl2.find_elements_by_class_name('pl')[1].text     # rating count
    print(name)
    print(info)
    print(num)

3. Open Baidu, locate the search box, and search

from selenium import webdriver

url="http://www.baidu.com"
driver=webdriver.Chrome()
driver.maximize_window()    # 全屏
driver.get(url)
element=driver.find_element_by_id('kw')
element.send_keys("python")
driver.find_element_by_id("su").click()

4. Open a page, locate elements by tag and id, and drag one onto the other

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/python3/python3-file-methods.html')
element = driver.find_element_by_tag_name("title")
target = driver.find_element_by_id("footer")

from selenium.webdriver import ActionChains
action_chains = ActionChains(driver)
action_chains.drag_and_drop(element, target).perform()   # drag `element` onto `target`

Reposted from blog.csdn.net/sakura55/article/details/80652445