Classes and objects, and storing data in a spreadsheet

Classes and basic syntax

# coding:utf-8

# Procedural programming languages: for example, C
# Python is an object-oriented language: encapsulation, inheritance, polymorphism
# Class: an abstract collection of things that share the same attributes and behaviors
# Object: a concrete instance of a class

# "class" means a group or category -- read it as "class" in the OOP sense
# object is the base class (the ancestor of all classes) in Python
class People(object):
    # Initializer: assigns the attribute values when an object is created
    # __init__ is called automatically at object creation
    def __init__(self, sex, age, height, weight):
        # Assign values to the object's attributes
        self.sex = sex
        self.age = age
        self.height = height
        self.weight = weight

    # Define the object's behavior
    # Instance methods can only be called through an object
    def eat(self):
        # self is whichever object called this method:
        # if object A calls it, self is A; if object B calls it, self is B
        print self
    def run(self, time):
        print '%s跑了%s分钟' % (self.name, time)

# Create a People object
# object_name = ClassName(value1, value2, ...)
p1 = People('男', 0.83, 30, 8.8)
# Read the object's attributes
# variable = object.attribute_name
sex = p1.sex
age = p1.age
height = p1.height
weight = p1.weight
print sex, age, height, weight


# Modify an attribute value
p1.age = 18
print p1.age
# Add a new attribute
p1.name = '张三'
p1.run(2)


# Dynamically add an attribute to an object
# object.attribute_name = value
# p1.name = '张三'
# print p1.name
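The eat() method is defined above but never called. A quick sketch, reusing the People class just defined, of how self always refers to the object that makes the call (the attribute values here are purely illustrative):

# Two separate People objects (illustrative values)
p2 = People('女', 25, 1.65, 50)
p3 = People('男', 30, 1.80, 70)

# Inside eat(), self is the object the method was called on,
# so the two calls print two different objects
p2.eat()   # e.g. <__main__.People object at 0x...>
p3.eat()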

How to store data in a table

# coding:utf-8
# Open the cmd command-line tool (Windows+R, type cmd, press Enter)
# and run: pip install xlwt   to install the xlwt package
import xlwt

# Create a workbook; encoding='utf-8' makes it possible to write Chinese text
workbook = xlwt.Workbook(encoding='utf-8')
# Add a worksheet that will hold the data
sheet = workbook.add_sheet(u'智联招聘python职位表')
# Write data into the sheet; the arguments are
# 1. row index  2. column index  3. the value to write
sheet.write(0, 0, '职位名称')
sheet.write(0, 1, '公司名称')

# Save the workbook to a file
workbook.save(u'智联招聘python职位表.xls')
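The snippet above only fills in the header row before saving. A minimal sketch of how data rows would then be appended (inserted before the workbook.save(...) call); the records list and its contents are purely illustrative:

# Illustrative records; in the crawler below these come from the regex matches
records = [(u'python开发工程师', u'某某公司'), (u'python爬虫工程师', u'某某网络')]

# Data rows start at row 1, directly below the header
row = 1
for zwmc, gsmc in records:
    sheet.write(row, 0, zwmc)   # job title
    sheet.write(row, 1, gsmc)   # company name
    row += 1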

Example: rewriting the Zhaopin (智联招聘) crawler with classes and objects, and storing the data in a spreadsheet

# coding: utf-8
import requests
import re
import xlwt

# Crawler class for Zhaopin (智联招聘)
class ZLZP(object):

    '''
    http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京%2B上海%2B广州%2B深圳%2B杭州&kw=python
    '''

    def __init__(self):
        self.html = ''
        self.nextLink = ''
        self.filename = ''
        # Prepare the request headers
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
        self.result_data = []

    # Fetch the html source for the current url
    def get_html(self):
        # Send the request with the headers attached
        response = requests.get(self.nextLink, headers=self.headers)
        self.html = response.content

    # Build the url of the first page of search results
    def get_index_url(self):
        # Store the keyword as an attribute of the object
        self.kw = raw_input('请输入要爬取的职位名称:')
        city_list = []
        # Ask for the cities to search, five at most
        while len(city_list) < 5:
            city = raw_input('请输入要查询的城市(最多5个,输入0结束:):')
            if city == '0':
                break
            city_list.append(city)
        # join() glues the list items together with the separator string and returns one string
        citys = '%2B'.join(city_list)
        # Build the url of the first page and assign it to nextLink
        self.nextLink = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%s&kw=%s' % (citys, self.kw)

    # Extract the link to the next page
    def get_nextLink(self):
        pattern = re.compile(r'<li class="pagesDown.*?<a.*?href="(.*?)" class="next-page">',re.S)
        rs = re.search(pattern, self.html)
        # If the search matched, there is a next page
        if rs:
            self.nextLink = rs.group(1)
        else:
            self.nextLink = None

    # Collect the data from every result page
    def get_result_data(self):
        # Keep fetching as long as there is a next-page link
        count = 0
        while self.nextLink:
            count += 1
            print '正在爬取第%s页数据.....'%count
            # Fetch the html source
            self.get_html()
            # Prepare the regular expression
            pattern = re.compile(r'<table.*?class="newlist.*?<td class="zwmc.*?<a.*?>(.*?)</a>.*?<td class="gsmc.*?<a.*?>(.*?)</a>.*?<td class="zwyx">(.*?)</td>.*?<td class="gzdd">(.*?)</td>',re.S)
            # Find every match in the html
            rs = re.findall(pattern, self.html)
            # Append this page's records to the overall list
            self.result_data.append(rs)
            # Extract the link to the next page
            self.get_nextLink()

    # Save the data
    def save_data(self):
        # Create a workbook
        workbook = xlwt.Workbook(encoding='utf-8')
        # kw is a byte string; decode it to unicode before using it in the sheet name
        sheet = workbook.add_sheet(u'%s职位表'%unicode(self.kw, 'utf-8'))
        sheet.write(0, 0, '职位名称')
        sheet.write(0, 1, '公司名称')
        sheet.write(0, 2, '职位月薪(最低)')
        sheet.write(0, 3, '职位月薪(最高)')
        sheet.write(0, 4, '工作地点')
        # Data rows start at row 1
        count = 1
        # Loop over the data of every page
        for rs in self.result_data:
            for detail in rs:
                # Regex that strips tags and &nbsp; from the job title
                strip_ele = re.compile(r'&nbsp;|<.*?>')
                # Pull out the job title, company name, monthly salary and work location
                zwmc = re.sub(strip_ele, '', detail[0])
                gsmc = detail[1]
                zwyx = detail[2]
                if '-' in zwyx:
                    min_money = zwyx.split('-')[0]
                    max_money = zwyx.split('-')[1]
                else:
                    min_money = max_money = '面议'
                gzdd = detail[3]
                sheet.write(count, 0, zwmc)
                sheet.write(count, 1, gsmc)
                sheet.write(count, 2, min_money)
                sheet.write(count, 3, max_money)
                sheet.write(count, 4, gzdd)
                # Move on to the next row
                count += 1
        # Save the workbook
        workbook.save(u'%s职位爬取数据.xls'%unicode(self.kw, 'utf-8'))

    # Start the crawler
    def start(self):
        self.get_index_url()    # self.nextLink = url of the search results page
        self.get_html()         # self.html = html source of that page
        self.get_result_data()
        self.save_data()

zlzp = ZLZP()
zlzp.start()

Rewriting the Qiushibaike (糗事百科) crawler with the idea of classes and objects


# coding: utf-8
import requests
import re


class QSBK(object):

    def __init__(self):
        # html attribute holds the page source
        self.html = ''
        self.total_page = 0
        self.result_data = []

    def get_html(self, page):
        url = 'https://www.qiushibaike.com/hot/page/%s/'%page
        response = requests.get(url)
        self.html = response.content

    def get_total_page(self):

        pattern1 = re.compile(r'class="dots.*?<span.*?>(.*?)</span>', re.S)
        rs = re.search(pattern1, self.html)
        # group(index) returns the content of the capture group at that index
        total_page = rs.group(1)
        total_page = total_page.strip('\n')
        self.total_page = int(total_page)
        print '共%s页段子内容!' % total_page

    def get_result_data(self):
        # Loop over every page and fetch its html source
        for x in range(1, self.total_page + 1):
            print '正在爬取第%s页段子....' % x
            self.get_html(x)
            pattern = re.compile(
                r'<div class=".*?qiushi_tag.*?<h2>(.*?)</h2>.*?<div class="articleGender.*?>(.*?)</div>.*?<span>(.*?)</span>.*?<i class="n.*?>(.*?)</i>.*?<i.*?>(.*?)</i>',
                re.S)
            # Find every block that matches the pattern
            rs = re.findall(pattern, self.html)
            self.result_data.append(rs)

    def save_data(self):
        file_handle = open('qsbk.txt', 'w')
        for x in range(1, len(self.result_data)+1):
            rs = self.result_data[x-1]
            # Write a separator line for each page
            file_handle.write('=================第%s页===============\n' % x)
            for detail in rs:
                # Pull out the fields
                name = detail[0]
                # strip() removes the given characters from both ends of a string
                name = name.strip('\n')
                age = detail[1]
                content = detail[2]
                content = content.strip('\n')
                # Regex matching <br/> tags
                pattern1 = re.compile(r'<br/>')
                # Replace them with real newlines using sub()
                content = re.sub(pattern1, '\n', content)
                vote_number = detail[3]
                comment_number = detail[4]
                # First line written for each joke: the user info
                s1 = '用户名:%s    年龄:%s\n' % (name, age)
                file_handle.write(s1)
                # Second line: vote count and comment count
                s2 = '好笑数:%s    评论数:%s\n' % (vote_number, comment_number)
                file_handle.write(s2)
                # Then the joke text itself
                file_handle.write(content)
                file_handle.write('\n\n')
        # Close the file
        file_handle.close()

    def start(self):
        self.get_html(1)
        self.get_total_page()
        self.get_result_data()
        self.save_data()
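Unlike the ZLZP example, this class is only defined here; it is driven by the menu script at the end of the post. Running it on its own follows the same two lines used for the other crawlers:

# Create a QSBK object and start the crawler (same pattern as zlzp = ZLZP(); zlzp.start())
qsbk = QSBK()
qsbk.start()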

Rewriting the Baidu Tieba (百度贴吧) crawler with the idea of classes and objects

# coding: utf-8
# Import the requests package
import requests
# Import the regular expression package
import re

# Baidu Tieba crawler class
class BDTB(object):

    # Initializer: sets up the basic information the crawler needs
    def __init__(self, numbers):
        # Thread (post) number
        self.numbers = numbers
        # Total number of pages in the thread
        self.total_page = 0
        # Name of the file the thread will be saved to
        self.filename = ''
        # Attribute that holds the html source
        self.html = ''
        # Data collected from every page of the thread
        self.result_list = []
    # Fetch the html source of one page
    def get_html(self, page):
        # Build the url
        url = 'https://tieba.baidu.com/p/%s?pn=%s'%(self.numbers, page)
        response = requests.get(url)
        # Update the html attribute
        self.html = response.content
    # Work out the name of the file to save to
    def get_filename(self):
        title_pat = re.compile(r'<title>(.*?)</title>')
        rs = re.search(title_pat, self.html)
        print '正在爬取:%s......' % rs.group(1)
        # Update the filename attribute
        self.filename = rs.group(1) + '.txt'
    def get_total_page(self):
        pages_pat = re.compile(r'共<span class="red">(.*?)</span>')
        rs = re.search(pages_pat, self.html)
        # Update the total_page attribute
        self.total_page = int(rs.group(1))
        print '该贴共%s页,正在准备爬取,请稍后......' % self.total_page
    def get_result_data(self):
        # Loop once for every page of the thread
        for x in range(1, self.total_page + 1):
            print '正在爬取第%s页,请稍后.....' % x
            # Fetch the html source of page x
            self.get_html(x)
            # 7. Prepare the extraction regex; re.S lets . match any character, including newlines
            pattern = re.compile(
                r'<li class="d_name".*?<a data-.*?>(.*?)</a>.*?<div class="d_badge_title.*?>(.*?)</div>.*?d_badge_lv">(.*?)</div>.*?<cc>(.*?)</cc>.*?<span class="tail-info.*?<a.*?>(.*?)</a>.*?<spa.*?>(.*?)</spa.*?info">(.*?)</span>',
                re.S)
            # 8. findall() returns every match of the regex
            rs = re.findall(pattern, self.html)
            # Append this page's data to the overall list
            self.result_list.append(rs)
    # Save the data
    def save_data(self):
        # 1. Open the file
        file_handle = open(self.filename.decode('utf-8'), 'w')
        # 2. Loop over the data of every page
        for rs in self.result_list:
            # Loop over every post on the page
            for detail in rs:
                # print detail
                # 1. Username
                name = detail[0]
                # 1.1 Clean up the username: match any <img> tags
                replace_img = re.compile(r'<img.*?>')
                # 1.2 and replace them with '-'
                name = re.sub(replace_img, '-', name)
                # 2. Badge title
                rank = detail[1]
                # 3. Level
                level = detail[2]
                # 4. Post content
                content = detail[3]
                # 4.1 Replace <br> tags with newlines
                content = content.replace('<br>', '\n')
                # 4.2 Strip all remaining tags
                strip_ele = re.compile(r'<.*?>')
                content = re.sub(strip_ele, '', content)
                # 4.3 Trim surrounding whitespace
                content = content.strip()
                # print content
                # 5. Client the post was sent from
                from_device = '来自' + detail[4]
                # If no client is given, mark it as posted from a PC
                if 'img' in detail[4]:
                    from_device = '来自PC电脑端'
                # 6. Floor number
                floor_num = detail[5]
                if 'a' in floor_num:
                    floor_num = '未知'
                # 7. Post time
                datetime = detail[6]
                if 'a' in detail[6]:
                    datetime = '未知'
                file_handle.write('***************%s******************\n' % floor_num)
                file_handle.write('用户名:%s   头衔:%s   等级%s\n' % (name, rank, level))
                file_handle.write(content)
                file_handle.write('\n')
                file_handle.write('%s 日期:%s\n\n' % (from_device, datetime))

        # 3. Close the file
        file_handle.close()
        print '数据爬取完毕,已存入*[%s]*,请稍后自行查看!' % self.filename
    # start() drives the crawler
    def start(self):
        self.get_html(1)        # self.html = html source of the first page
        self.get_filename()     # self.filename = thread title + '.txt'
        self.get_total_page()   # self.total_page = total number of pages in the thread
        self.get_result_data()  # self.result_list = data from every page
        self.save_data()        # write everything to the file


# Create an object, passing in a thread number
# bdtb = BDTB(5328438222)
# # The object then runs the start function to crawl the thread
# bdtb.start()
# coding: utf-8

import requests
import re
# Import the BDTB class from bdtb_class_demo
from bdtb_class_demo import BDTB

# Class names start with a capital letter, and every following word is capitalized as well
# Upper camel case (PascalCase):  LinkSpider
# Lower camel case:  linkSpider
class LinkSpider(object):

    def __init__(self):
        self.tb_name = ''
        # Attribute that records the total number of pages
        self.total_page = 0
        # Attribute that holds the html source
        self.html = ''

    # Fetch the html source; takes a page parameter
    def get_html(self, page):
        url = 'https://tieba.baidu.com/f?kw=%s&ie=utf-8&tab=good&pn=%s'%(self.tb_name, (page*50))
        response = requests.get(url)
        # Update the object's html attribute
        self.html = response.content

    # Work out the total number of pages
    def get_total_page(self):
        count_pat = re.compile(r'精品数<span.*?>(.*?)</span>')
        rs = re.search(count_pat, self.html)
        counts = int(rs.group(1))
        if counts % 50 == 0:
            total_page = counts / 50
        else:
            total_page = counts / 50 + 1
        print '共有%s页,%s个精品贴,正在准备爬取,请稍后......' % (total_page, counts)
        # Update the object's total_page attribute
        self.total_page = total_page

    # Crawl every page of the listing
    def get_result_data(self):

        for x in range(0, self.total_page):
            print '正在爬取第%s页精品贴.....' % (x + 1)
            # Updates the object's html attribute
            self.get_html(x)
            # 1. Prepare the regex
            pattern = re.compile(r'<div class="threadlist_titl.*?href="(.*?)"', re.S)
            # 2. Find every matching link
            rs = re.findall(pattern, self.html)
            # Loop over the links and take each thread's number
            for link in rs:
                # split() returns a list; the thread number is the last element
                numbers = link.split('/')[-1]
                # Create a BDTB object for that thread and run its crawler
                bdtb = BDTB(numbers)
                bdtb.start()

    def start(self):
        tb_name = raw_input('请输入要爬取的贴吧名称:')
        self.tb_name = tb_name
        self.get_html(0)
        self.get_total_page()
        self.get_result_data()


# link = LinkSpider()
# link.start()
# -*- coding: utf-8 -*-
__author__ = 'wj'
__date__ = '2018/1/11 14:32'
from link_class_demo import LinkSpider
from qsbk_demo import QSBK
from zlzp_demo import ZLZP

print '1.百度贴吧数据爬取'
print '2.糗事百科数据爬取'
print '3.智联招聘数据爬取'



sele_num = input('请选择:')

if sele_num == 1:
    obj = LinkSpider()
elif sele_num == 2:
    obj = QSBK()
else:
    obj = ZLZP()


obj.start()
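One caveat with the menu: in Python 2, input() evaluates whatever the user types. A minimal variant using raw_input(), which always returns a string, keeps the comparison purely textual:

# raw_input() returns the choice as a string, so compare against string literals
sele_num = raw_input('请选择:')

if sele_num == '1':
    obj = LinkSpider()
elif sele_num == '2':
    obj = QSBK()
else:
    obj = ZLZP()

obj.start()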