# Post 7: Python mobile crawling with Fiddler - a new era of data collection (3)

# An introduction to mobile crawling
1. The idea behind mobile crawling - how to scrape content from inside an app:
a. The phone and the computer communicate through Fiddler, which acts as a relay station (a proxy) for the traffic;
b. Once the traffic is captured, the data is scraped the same way as from an ordinary web page;

2. What needs to be configured in Fiddler and on the phone:
a. Download and install Fiddler; the computer and the phone must be on the same network;
b. PC-side configuration: run cmd -> ipconfig to get the computer's IP address, which the phone-side configuration below needs (in Fiddler itself, remote connections must also be allowed under Tools -> Options -> Connections);
c. Phone-side configuration (Douyin and Kuaishou have anti-crawling measures: once you finish the configuration and try to capture their traffic, they will cut off your network access. The only workaround is to run an Android emulator on the PC, which gets around the blocking - though they may well patch that again at some point):
#1. Set a network proxy: the hostname is the computer's IP address (not fixed - it changes with the network); the port is Fiddler's port and can be changed (the exact steps differ from phone to phone, but as long as these two values are set you should be fine);
#2. Download the Fiddler certificate on the phone (this grants capture permission): open http://<ip address>:<port> in a browser. If the phone browser cannot open it, download the certificate on the PC and copy it over to the phone manually. A quick way to verify from the PC that traffic really flows through Fiddler is sketched below.

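To sanity-check the setup from the PC side, you can route a request through Fiddler and watch it appear in the session list. A minimal sketch using the requests library - the address 192.168.1.100 and port 8888 are placeholders; substitute your own ipconfig address and Fiddler's port:

import requests

# Placeholder values - replace with the IP from ipconfig and the port Fiddler listens on
FIDDLER_PROXY = 'http://192.168.1.100:8888'
proxies = {'http': FIDDLER_PROXY, 'https': FIDDLER_PROXY}

# verify=False because Fiddler re-signs HTTPS traffic with its own certificate
resp = requests.get('https://www.baidu.com', proxies=proxies, verify=False)
print(resp.status_code)  # a 200 here, plus a new session row in Fiddler, means the proxy works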
3. Crawler example: scraping the images of a Toutiao anime entry with Scrapy:

Project layout (the standard Scrapy project structure, containing the files shown below):

images/
    scrapy.cfg
    images/
        items.py
        pipelines.py
        settings.py
        spiders/
            images_toutiao.py

settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for images project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'images'

SPIDER_MODULES = ['images.spiders']
NEWSPIDER_MODULE = 'images.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'images (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'images.middlewares.ImagesSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'images.middlewares.ImagesDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
# # The line above is just a browser User-Agent header; enabling it is cheap insurance against requests being rejected
ITEM_PIPELINES = {
   'images.pipelines.ImagesPipeline': 300,
}
IMAGES_STORE = 'D:\\python\\Scrapy\\image\\test'


#IMAGES_EXPIRES = 90
#IMAGES_MIN_HEIGHT = 100
#IMAGES_MIN_WIDTH = 100
# IMAGES_STORE sets the directory where downloaded images are saved. IMAGES_EXPIRES sets
# the longest time (in days) an image is kept before it is re-downloaded.
# IMAGES_MIN_HEIGHT and IMAGES_MIN_WIDTH set minimum image dimensions; smaller images are filtered out.

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ImagesItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
# image_urls and images are field names that Scrapy's ImagesPipeline expects; they must not be renamed

images_toutiao.py:

# -*- coding: utf-8 -*-
import scrapy
import re
from ..items import ImagesItem

class ImagesToutiaoSpider(scrapy.Spider):
    name = 'images_toutiao'
    allowed_domains = ['a3-ipv6.pstatp.com']
    start_urls = ['https://a3-ipv6.pstatp.com/article/content/25/1/6819945008301343243/6819945008301343243/1/0/0']  # the URL to crawl, captured in Fiddler

    # Article IDs captured for the image pages:
    # https://a3-ipv6.pstatp.com/article/content/25/1/6819945008301343243/6819945008301343243/1/0/0
    # https://a3-ipv6.pstatp.com/article/content/25/1/6848145029051974155/6848145029051974155/1/0/0
    # https://a6-ipv6.pstatp.com/article/content/25/1/6848145029051974155/6848145029051974155/1/0/0
    # https://a3-ipv6.pstatp.com/article/content/25/1/6848145029051974155/6848145029051974155/1/0/0
    # I collected three links for the same kind of page - the addresses are essentially identical

    def parse(self, response):
        result = response.body.decode()  # decode the response fetched from start_urls
        # every image URL sits in the embedded JSON in the form },{"url":"..."}
        contents = re.findall(r'},{"url":"(.*?)"}', result)

        for content in contents:
            # keep only URLs no longer than the first one, filtering out the longer variant links
            if len(content) <= len(contents[0]):
                item = ImagesItem()
                image_urls = [content]  # ImagesPipeline expects a list of URLs
                item['image_urls'] = image_urls
                print(image_urls)
                yield item
        # Pagination - crawl images from several article pages by swapping in the article IDs
        # captured in Fiddler (only the ID part of the URL changes); only crawl the first few pages:
        # self.pages = ['6819945008301343243/6819945008301343243/1/0/0',
        #               '6848145029051974155/6848145029051974155/1/0/0']
        # for page in self.pages:
        #     url = 'https://a3-ipv6.pstatp.com/article/content/25/1/' + page
        #     yield scrapy.Request(url, callback=self.parse)
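The spider is launched with scrapy crawl images_toutiao from the project root; equivalently, a small runner script built on Scrapy's cmdline helper is convenient inside an IDE (the file name run.py is my own choice, not part of the original project):

# run.py - put it in the project root, next to scrapy.cfg
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'images_toutiao'])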

pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

# get_media_requests and item_completed below override hooks built into Scrapy's ImagesPipeline;
# if you want to rename the downloaded files, this is the place to do it.
# You can copy this code as-is and it will work.

class ImagesPipeline(ImagesPipeline):  # subclasses (and shadows) Scrapy's ImagesPipeline; settings.py points at images.pipelines.ImagesPipeline
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem('Item contains no images')
        # item['image_paths'] = image_path  # would require an image_paths field on ImagesItem
        return item

#     def file_path(self, request, response=None, info=None):
#         name = request.meta['name']    # receives the image name passed in via the request's meta
#         name = re.sub(r'[?\\*|"<>:/]', '', name)    # strip characters Windows forbids in filenames;
#                                                     # skip this step and you get garbled names or failed downloads
#         filename = name + '.jpg'       # add the image file extension
#         return filename
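The commented-out file_path above assumes that get_media_requests attached a name to each request via meta, and it needs an import re at the top of pipelines.py. A minimal sketch of how the two pieces fit together - the NamedImagesPipeline class and the numbering scheme are my own illustration, not part of the original project:

import re

from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline as ScrapyImagesPipeline


class NamedImagesPipeline(ScrapyImagesPipeline):
    """Illustrative variant: saves each image under a name passed through request.meta."""

    def get_media_requests(self, item, info):
        for index, image_url in enumerate(item['image_urls']):
            # 'name' is an arbitrary label; here we simply number the images per item
            yield Request(image_url, meta={'name': 'toutiao_%d' % index})

    def file_path(self, request, response=None, info=None):
        name = re.sub(r'[?\\*|"<>:/]', '', request.meta['name'])  # strip characters Windows forbids
        return name + '.jpg'

To use it, point ITEM_PIPELINES in settings.py at images.pipelines.NamedImagesPipeline instead of images.pipelines.ImagesPipeline.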

That completes the crawl of the Toutiao app. It can feel difficult the first time you touch it, and you will run into a few problems, but once you have digested the material you will find that, compared with ordinary web crawling, it is mostly a configuration problem - not that complicated.
Recently I have been learning about web development templates and building a blog site. Progress was very slow because I could not build static web pages, but I solved that recently: I found a template online and modified it myself. It also made me understand one thing: the difficulty of a task piles up slowly in our minds, sometimes to the point where we give up, yet once you truly break through you realize it was never as hard as it felt.
A simple example: most people who learn to drive have been through this. While learning, the test feels like everything, and failing it feels like failing at life - when in fact nothing is wrong with us. So if you have any questions, feel free to contact me.
This is the seventh post in the series, continuously updated.

Reprinted from: blog.csdn.net/weixin_46008828/article/details/108690179