python python3 爬虫模板、爬虫功能大全

欢迎补充、欢迎点赞~~~

#coding:utf-8
import json
import os
import random
import re
import time
from collections import Counter

import requests

class Spider:
    """Template crawler that bundles common scraping helpers.

    Provides a random User-Agent pool, GET/POST requests, HTML parsing
    with BeautifulSoup, a proxy mapping, file download, HTML-to-PDF export
    and word-frequency counting with jieba.
    """

    # Seconds to wait for a server; without a timeout a request can block
    # indefinitely on an unresponsive host.
    _TIMEOUT = 10

    # Pool of User-Agent strings that user_agent_proxy() picks from.
    _USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
        'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
        'Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11',
    )

    def __init__(self, main_url):
        """Store the URL(s) to crawl.

        Args:
            main_url: an iterable of URL strings, or a single URL string.
        """
        # A bare string would otherwise be iterated character by character
        # in main(), so a single URL is wrapped in a list.
        self._main_url = [main_url] if isinstance(main_url, str) else main_url

    def user_agent_proxy(self):
        """Return a User-Agent string chosen at random from the pool."""
        return random.choice(self._USER_AGENTS)

    def main(self, method='get'):
        """Request every configured URL and pass the HTML to the parser.

        Args:
            method: 'get' sends the parameters as a query string, 'post'
                sends them as a form-encoded body (case-insensitive).

        Raises:
            ValueError: if ``method`` is neither 'get' nor 'post'.
        """
        method = method.lower()
        if method not in ('get', 'post'):
            raise ValueError("method must be 'get' or 'post', got %r" % method)

        # Template headers: fill in the blanks for the target site.
        headers = {
            'Host': '',
            'Origin': '',
            'Cookie': '',
            'User-Agent': self.user_agent_proxy(),
            'Referer': '',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        # Blank placeholders are dropped rather than sent: an empty Host,
        # for example, would replace the value derived from the URL.
        headers = {key: value for key, value in headers.items() if value}

        # Template paging parameters; adjust to the target site.
        params = {
            'page.currentPage': '',
            'page.perPageSize': '20',
        }

        for url in self._main_url:
            # Other optional requests arguments:
            #   proxies=self.ip_proxys()                    IP proxy
            #   auth=HTTPBasicAuth('username', 'password')  authentication
            #   verify=False                                skip SSL checks
            if method == 'post':
                r = requests.post(url, data=params, headers=headers,
                                  timeout=self._TIMEOUT)
            else:
                r = requests.get(url, params=params, headers=headers,
                                 timeout=self._TIMEOUT)

            # Error pages are not worth parsing.
            if not self.return_response_code(r):
                continue

            # r.text is a property, not a method. For a JSON API use
            # r.json() and skip the HTML parser.
            self.bs4_work_html(r.text)

    def bs4_work_html(self, html):
        """Parse ``html`` with BeautifulSoup and print sample query results.

        Args:
            html: the HTML markup to parse.

        Returns:
            The parsed BeautifulSoup object.
        """
        # Imported locally, like pdfkit and jieba in the methods below.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, 'lxml')

        # Tag access: soup.b is the first <b> tag, soup.b.string its text,
        # tag['class'] a single attribute and tag.attrs all attributes.
        #
        # find_all(name=None, attrs={}, recursive=True, text=None,
        #          limit=None, **kwargs)
        #   name:      match tags by name; string objects are ignored.
        #              Accepts a string ('b'), a regular expression
        #              (re.compile("^b")), a list (["a", "b"]) or True
        #              (every tag). The other arguments accept the same
        #              kinds of filter.
        #   attrs:     dict of attribute name -> attribute value.
        #   recursive: search all descendants (default True) or only
        #              direct children.
        #   text:      match on a tag's .string, usually with a regular
        #              expression.
        #   limit:     maximum number of results to return.
        #   kwargs:    any other keyword is treated as a tag attribute.
        #              ``class`` is a Python keyword, so use ``class_``.
        print(soup.find_all(attrs={"data-foo": "value"}))

        # class_ matches when any one of a tag's CSS classes matches. A
        # value holding several class names must equal the tag's class
        # string exactly: same order, nothing skipped.
        print(soup.find_all("p", class_="strikeout"))

        # CSS selectors: by class, by id, and combined.
        print(soup.select('.sister'))
        print(soup.select('#link1, #link2'))
        print(soup.select('p #link1'))

        return soup

    def ip_proxys(self):
        """Return the proxy mapping to pass as ``proxies=`` to requests.

        Returns:
            A dict mapping URL scheme to proxy URL (placeholder addresses).
        """
        return {
            'http': 'http://10.10.10.10:1111',
            'https': 'https://10.10.10.10:1111',
        }

    def download_file(self, file_url, file_name):
        """Download a binary file (image, video, audio, ...) to disk.

        Args:
            file_url: URL of the file to fetch.
            file_name: path the response body is written to.

        Returns:
            True if the file was written, False if the server answered
            with an error status (nothing is written in that case).
        """
        r = requests.get(file_url, timeout=self._TIMEOUT)
        if not self.return_response_code(r):
            return False

        with open(file_name, 'wb') as f:
            f.write(r.content)
        return True

    def return_response_code(self, response):
        """Return True when the response status code is below 400.

        Args:
            response: any object with an integer ``status_code`` attribute.
        """
        return response.status_code < 400

    def do_pdf(self, url, title, doc_type='pdf'):
        """Render the page at ``url`` to a PDF file.

        The file is written to ``./<doc_type>/<doc_type>-<title>-<ts>.pdf``
        where ``<ts>`` is the current Unix timestamp in seconds.

        Args:
            url: address of the page to convert.
            title: title used in the output file name.
            doc_type: category used as output directory and name prefix.

        Returns:
            True once pdfkit has finished.
        """
        import pdfkit  # third-party wrapper around the wkhtmltopdf binary

        # Make sure the target directory exists before writing into it.
        os.makedirs(doc_type, exist_ok=True)

        # Raw string: '\w' is not a valid escape sequence.
        confg = pdfkit.configuration(wkhtmltopdf=r'.\wkhtmltopdf.exe')
        output = './%s/%s-%s-%s.pdf' % (doc_type, doc_type, title,
                                        int(time.time()))
        pdfkit.from_url(url, output, configuration=confg)

        return True

    def jieba_fence(self, file_name, file_path):
        """Segment a UTF-8 text file with jieba and print word frequencies.

        Words are printed most frequent first, one ``word count`` pair per
        line; single-character words are left out.

        Args:
            file_name: name of the text file.
            file_path: directory that contains the file.
        """
        import jieba

        with open(os.path.join(file_path, file_name), "r",
                  encoding='utf-8') as f:
            text = f.read()

        # jieba.cut with default arguments (the original comment calls
        # this jieba's precise mode).
        for word, count in self._count_words(jieba.cut(text)):
            print("{0:<5}{1:>5}".format(word, count))

    @staticmethod
    def _count_words(words):
        """Count words, skipping single characters, most frequent first.

        Args:
            words: iterable of word strings.

        Returns:
            A list of ``(word, count)`` tuples sorted by descending count;
            ties keep first-seen order.
        """
        counts = Counter(word for word in words if len(word) != 1)
        return counts.most_common()
python python3 爬虫模板、爬虫功能大全

欢迎补充、欢迎点赞~~~

猜你喜欢