爬虫常用技巧(1)-- 基本技能

面向对象编写爬虫

注意点

1 format 传参
2 生成 index 区分文件名
3 面向对象的思维,获取数据,保存数据,运行三部分分开
4 sys.argv 命令行传参

示例代码

import requests
import time
import sys


class Tieba(object):
    """Fetch the URLs in ``url_list`` and save each response body as an HTML file.

    The work is split into three parts: ``get_data`` fetches one URL,
    ``save_data`` writes one file, and ``run`` drives both over ``url_list``.
    """

    def __init__(self, name):
        """Build the request headers and the list of URLs to fetch.

        Args:
            name: Value substituted into the ``kw`` query parameter; it is
                also the prefix of every saved file name.
        """
        self.name = name
        self.base_url = "http://tieba.baidu.com/f?ie=utf-8&kw={}&ie=utf-8&pn=".format(self.name)
        self.headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        # NOTE(review): pn runs 0..9 here; Tieba presumably pages in steps of
        # 50 (pn=0, 50, ...) - confirm before relying on the page contents.
        self.url_list = [self.base_url + str(i) for i in range(10)]

    def get_data(self, url):
        """Send a GET request for ``url`` and return the raw response body (bytes)."""
        response = requests.get(url, headers=self.headers)
        return response.content

    def save_data(self, content, index):
        """Write ``content`` to ``<name><index>.html`` in the working directory.

        Args:
            content: Bytes to write; the file is opened in binary mode.
            index: Number appended to the name so every page gets its own file.
        """
        # An alternative naming scheme is a timestamp suffix:
        # self.name + str(time.time()) + ".html"
        filename = self.name + str(index) + '.html'
        with open(filename, 'wb') as f:
            f.write(content)

    def run(self):
        """Fetch every URL in ``url_list`` and save it under a 1-based index."""
        # enumerate supplies the position directly; looking it up with
        # list.index() rescans the list on every iteration and returns the
        # first match when a URL occurs more than once.
        for index, url in enumerate(self.url_list, start=1):
            data = self.get_data(url)
            self.save_data(data, index)


if __name__ == '__main__':
    # The forum name comes from the first command-line argument; exit with a
    # usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} <tieba_name>".format(sys.argv[0]))
    name = sys.argv[1]
    tieba = Tieba(name)
    tieba.run()

关于post的编解码问题

注意点

1 传递的data 参数是个字典
2 对 content(bytes 类型)首先进行解码 decode,得到 str
3 对解码得到的 str 进行 json.loads,得到字典
4 从最后得到的字典中去取值

示例代码

import requests
import sys
import json

class Ciba(object):
    """Look up a word by POSTing it to the iciba ``ajax.php?a=fy`` endpoint."""

    def __init__(self, word):
        """Store the word to look up together with the endpoint URL and headers.

        Args:
            word: Text sent as the ``w`` field of the POST form.
        """
        self.word = word
        self.url = 'http://fy.iciba.com/ajax.php?a=fy'
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }

    def get_data(self):
        """POST the form data and return the raw response body (bytes)."""
        # The form fields are passed to requests as a plain dict.
        post_data = {
            'f':'auto',
            't':'auto',
            'w':self.word,
        }
        response = requests.post(self.url, data=post_data, headers=self.headers)
        return response.content

    def run(self):
        """Fetch the response once, then print it raw, decoded and parsed.

        Prints, in order: the raw bytes, their type, the type of the decoded
        text, and the ``content -> out`` value of the parsed JSON.
        """
        # Fetch a single time and reuse the body: calling get_data() for each
        # print would send a separate POST request per call.
        raw = self.get_data()
        print(raw)
        print(type(raw))

        text = raw.decode()
        print(type(text))
        parsed = json.loads(text)
        print(parsed.get('content').get('out'))

if __name__ == "__main__":
    # The word to translate comes from the first command-line argument; exit
    # with a usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} <word>".format(sys.argv[0]))
    word = sys.argv[1]
    ciba = Ciba(word)
    ciba.run()

使用代理

注意点

http 和 https 不能同时开启
付费代理的使用格式:在代理地址中带上用户名和密码,例如 http://用户名:密码@IP:端口

示例代码

import requests

# Page requested in this demo.
url = 'https://www.taobao.com'

# Proxy table handed to requests, keyed by URL scheme.
# A paid proxy presumably carries its credentials inside the URL, e.g.
# 'http://user:password@183.163.40.223:31773' (the original comment was
# garbled by e-mail obfuscation - confirm the exact format with the provider).
# NOTE(review): the target URL is https while the https entry is empty, so
# this request presumably does not go through the proxy - confirm.
proxies = dict(
    http='http://183.163.40.223:31773',
    https='',
)

# Browser-like User-Agent sent with the request.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
headers = {'User-Agent': user_agent}

response = requests.get(url, headers=headers, proxies=proxies)

# Only the HTTP status code is reported.
print(response.status_code)

有关cookie 和session

注意点

1 把 cookie 写进 headers 时,键值对的键名是 cookie 而不是 cookies
2 url 需要直接写到 访问私人信息的页面
3 cookie 的两种设置方式:直接写在 headers 的 cookie 字段里,或者转成字典后通过 cookies 参数传入
4 正则匹配 re.findall 的用法:返回值是由所有匹配结果组成的列表

示例代码

import requests
import re
# Raw Cookie header value copied from a logged-in browser session; the
# adjacent literals are concatenated into one string.
cookie_header = (
    'anonymid=jawg9kd7k1zz09; _r01_=1; _ga=GA1.2.48173182.1525587736; __utma=151146938.48173182.1525587736.1526023590.'
    '1526023590.1; __utmz=151146938.1526023590.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/SysHome.do;'
    ' depovince=GW; jebecookies=dd54997e-141e-4db1-b3ac-5f7919c121a9|||||; JSESSIONID=abcb0QeuehygE7d76WGtw; ick_login='
    '48f4d146-3fe3-4806-a2a9-0762e7c742f9; _de=82D006EDB0340D0076B255B13038CCD8; p=0030030751d26caff8c1ffe172f1122d8; '
    'first_login_flag=1; ln_uact=15626046299; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=cb4639287e91a4a'
    '964a224ea50cd929f8; societyguester=cb4639287e91a4a964a224ea50cd929f8; id=965882188; xnsid=1f3be1d7; ver=7.0; '
    'loginfrom=null; wp_fold=0'
)

# The cookie travels as an ordinary request header under the key 'cookie'.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'cookie': cookie_header,
}

# The URL points straight at the profile page of the logged-in account.
url = 'https://www.renren.com/965882188'
response = requests.get(url, headers=headers)
print(response.status_code)
print()

# Decode the body, then list every occurrence of the marker text.
data = response.content.decode()
matches = re.findall("新用户", data)
print(matches)
# Final URL of the response.
print(response.url)

示例代码2

# coding:utf-8
import requests
import re

# Only the User-Agent goes into the headers; the cookie is passed separately.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
headers = {'User-Agent': user_agent}

# Raw cookie string copied from a logged-in browser session.
temp = 'anonymid=jawg9kd7k1zz09; _r01_=1; _ga=GA1.2.48173182.1525587736; __utma=151146938.48173182.1525587736.1526023590.1526023590.1; __utmz=151146938.1526023590.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/SysHome.do; depovince=GW; jebecookies=dd54997e-141e-4db1-b3ac-5f7919c121a9|||||; JSESSIONID=abcb0QeuehygE7d76WGtw; ick_login=48f4d146-3fe3-4806-a2a9-0762e7c742f9; _de=82D006EDB0340D0076B255B13038CCD8; p=0030030751d26caff8c1ffe172f1122d8; first_login_flag=1; ln_uact=15626046299; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=cb4639287e91a4a964a224ea50cd929f8; societyguester=cb4639287e91a4a964a224ea50cd929f8; id=965882188; xnsid=1f3be1d7; ver=7.0; loginfrom=null; wp_fold=0'

# Split into 'key=value' items, then split each item on its first '=' only,
# because some values (e.g. __utmz) contain '=' themselves.
temp_list = temp.split('; ')
cookies = {
    key: value
    for key, value in (item.split("=", 1) for item in temp_list)
}

# The URL points straight at the profile page of the logged-in account.
url = 'https://www.renren.com/965882188'

# The dict is handed to requests through the cookies parameter.
response = requests.get(url, headers=headers, cookies=cookies)
print(response.status_code)

# Decode the body, then list every occurrence of the marker text.
data = response.content.decode()
matches = re.findall("新用户", data)
print(matches)

# Final URL of the response.
print(response.url)

猜你喜欢

转载自blog.csdn.net/Enjolras_fuu/article/details/81265217