Python study notes 14: crawling 51job data and writing it to Excel

Because of my job, I sometimes need to survey the job postings and requirements of companies in the industry.

In the past I would pick out companies by manual search and copy-paste the postings into Excel as reference material.

Having learned some Python, this work should of course be done with Python.

Following the idea of growing a program's functionality "from small to large", I wrote 3 modules:

1. Excel operation module: creates the Excel document and writes the data into it.

2. Single-page data acquisition module: extracts the job title, salary, requirements and other data from one posting page and returns them as a list.

3. Integration module: walks through the search result pages, collects the URL of each result page, extracts every company's posting URL from those pages, then fetches the data URL by URL and stores it in an Excel file. (A sketch of the file layout follows.)
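
The three modules end up as three files. A sketch of the layout (I call the integration script main.py here for illustration; the name is my own choice):

project/
├── excelOp.py      # 1. Excel operations
├── singlepage.py   # 2. single-page data acquisition
└── main.py         # 3. integration module; imports the other two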

1. Excel operation module

excelOp.py uses the openpyxl module to create and write the Excel document.

# -*- coding:utf-8 -*-
import openpyxl
from openpyxl.styles import Border,Side,PatternFill,Font,Alignment

# Header row labels (in Chinese, as written to the spreadsheet): company name,
# job title, experience required, monthly salary, job requirements,
# province/city, work address, URL
dataHeader = ["公司名称", "岗位名称", "经验要求", "月薪", "工作要求", "省市", "工作地点", "网址"]

# Column widths (the keys must be uppercase column letters)
headerWidth = {
    "A": 20,
    "B": 20,
    "C": 15,
    "D": 20,
    "E": 50,
    "F": 15,
    "G": 15,
    "H": 20
}

# Header font
headerFont = Font('宋体', size=12, bold=True, italic=False, strike=False, color='000000')

# Cell border style
border = Border(left=Side(border_style='thin', color='000000'),
                right=Side(border_style='thin', color='000000'),
                top=Side(border_style='thin', color='000000'),
                bottom=Side(border_style='thin', color='000000'))

# Header fill color
headerFill = PatternFill(fill_type="solid", start_color='dddddd', end_color='dddddd')

# Alignment: header centered, content centered with wrapped text
headerAlign = Alignment(horizontal="center", vertical="center", wrap_text=False)
contentAlign = Alignment(horizontal="center", vertical="center", wrap_text=True)

def initExcel(excelName):
    '''
    Initialize the excel file and generate the table header.
    :param excelName: name of the excel file to generate
    :return:
    '''
    # Instantiate a workbook
    wb = openpyxl.Workbook()
    # Create a sheet
    # ws = wb.create_sheet("51job工作统计")
    ws = wb.active  # activate the default sheet
    ws.title = excelName  # rename the default sheet

    # Generate the header row
    for index, value in enumerate(dataHeader):
        headerCell = ws.cell(row=1, column=index + 1)
        headerCell.value = value
        # Cell styles: border, fill color, font, alignment
        headerCell.border = border
        headerCell.fill = headerFill
        headerCell.font = headerFont
        headerCell.alignment = headerAlign

    # Header row height and column widths
    ws.row_dimensions[1].height = 30
    for item in headerWidth:
        ws.column_dimensions[item].width = headerWidth[item]

    wb.save(excelName + ".xlsx")


def insertData(wb, dataList, rowNum):
    '''
    Insert one row of data into the excel file.
    :param wb: workbook object
    :param dataList: the row data
    :param rowNum: the row number to insert at
    :return:
    '''
    ws = wb[wb.sheetnames[0]]  # the first (and only) sheet in the file
    # Insert the data
    for i, item in enumerate(dataList):
        c = ws.cell(row=rowNum, column=i + 1, value=item)
        # Cell styles: border, text alignment
        c.border = border
        c.alignment = contentAlign
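
Before moving on, a quick usage sketch of these two functions; the file name "demo" and the sample row are made up for illustration:

# usage sketch for excelOp.py -- "demo" and the row data are made-up examples
import openpyxl
import excelOp

excelOp.initExcel("demo")                 # creates demo.xlsx with a styled header row
wb = openpyxl.load_workbook("demo.xlsx")  # reopen the file for writing
row = ["某公司", "前端开发", "3年", "8000-10000", "负责页面开发", "湖北", "武汉", "https://example.com"]
excelOp.insertData(wb, row, 2)            # write one data row right below the header
wb.save("demo.xlsx")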

2. Single-page data acquisition module

singlepage.py mainly uses the requests module to fetch the HTML of the corresponding posting page.

It then uses the BeautifulSoup (bs4) module or a regular expression, whichever fits, to extract the selected content.
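
For instance, both styles below pull the same job title out of a made-up HTML fragment that mimics the page structure used later:

# the two extraction styles side by side, on a made-up HTML fragment
import re
from bs4 import BeautifulSoup

html = '<div class="tHjob"><h1>Web前端开发</h1></div>'
print(BeautifulSoup(html, "lxml").select(".tHjob h1")[0].get_text())  # Web前端开发
print(re.findall(r'<h1>(.*?)</h1>', html, re.I | re.DOTALL)[0])       # Web前端开发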

# -*- coding:utf-8 -*-
import requests, re
from bs4 import BeautifulSoup  # HTML parser (Beautiful Soup, version 4)
import lxml   # parser backend used by bs4 (it only needs to be installed; the explicit import is optional)

def handlePageData(weburl, header):
    '''
    Process one posting page and return its data.
    :param weburl: the URL to request
    :param header: request headers (one randomly chosen set)
    :return: a list with the data extracted from the page
    '''
    tagURL = re.sub(r'\\', "", weburl)  # the URL comes from embedded JSON, so strip the escaping backslashes
    req = requests.get(url=tagURL, headers=header)
    req.encoding = req.apparent_encoding  # avoid garbled Chinese text

    webHTML = req.text

    bs = BeautifulSoup(webHTML, "lxml")

    # Company name
    try:
        companyName = bs.select(".tHjob .cname .catn")[0].get_text().strip()
    except Exception:
        companyName = "某公司"  # fallback: "some company"


    # Job title
    try:
        jobTitle = bs.select(".tHjob h1")[0].get_text().strip()
    except Exception:
        jobTitle = "暂无"  # fallback: "not available"

    # Salary
    try:
        jobMoney = bs.select(".tHjob .cn strong")[0].get_text().strip()
    except Exception:
        jobMoney = "面谈"  # fallback: "negotiable"
    else:
        # Keep only the part before "/月" ("per month")
        jobMoney = re.findall('(.*?)/月', jobMoney, re.I | re.DOTALL)
        if not jobMoney:  # empty list means the pattern did not match
            jobMoney = "面谈"
        else:
            jobMoney = jobMoney[0]

    # Job description
    try:
        jobMsg = str(bs.select(".job_msg")[0]).strip()
    except Exception:
        jobMsg = "暂无"
    else:
        reg = '<div class="bmsg job_msg inbox">(.*?)<div class="mt10">'
        # Cut out the description, split on <br/>, then strip the remaining tags
        jobMsg = re.findall(reg, jobMsg, re.I | re.DOTALL)[0].strip().split("<br/>")
        jobMsg = "\n".join(jobMsg)
        jobMsg = BeautifulSoup(jobMsg, "lxml").get_text()

    # Experience and location line
    try:
        jobAddrInfo = bs.select("p.ltype")[0].get_text().strip()
    except Exception:
        jobAddrInfo = "暂无"
    # Required experience
    jyReg = r'\|\xa0\xa0(.*?)经验'
    try:
        jobExp = re.findall(jyReg, jobAddrInfo, re.I | re.DOTALL)[0].strip()
    except Exception:
        jobExp = "0"
    else:
        # Strip the trailing "年" ("years")
        jobExp = re.findall('(.*?)年', jobExp, re.I | re.DOTALL)
        if not jobExp:  # empty list: the posting says no experience required
            jobExp = "0"
        else:
            jobExp = jobExp[0] + "年"

    # Province/city
    addReg = r'^(.*?)\xa0\xa0\|'
    try:
        jobAddr = re.findall(addReg, jobAddrInfo.strip(), re.I | re.DOTALL)[0]
    except Exception:
        jobAddr = "暂无"

    # Work address
    comAddrReg = r'<span class="label">上班地址:</span>(.*?)</p>'
    companyAddr = re.findall(comAddrReg, webHTML, re.I | re.DOTALL)
    if not companyAddr:
        companyAddr = "上班地址暂无"  # "work address not available"
    else:
        companyAddr = companyAddr[0]

    # Return the page data
    return [
        companyName,  # company name
        jobTitle,     # job title
        jobExp,       # required experience
        jobMoney,     # monthly salary
        jobMsg,       # job description
        jobAddr,      # province/city
        companyAddr,  # work address
        tagURL        # URL
    ]
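
A standalone test of this module might look like the sketch below; the posting URL is a placeholder, and the selectors assume the detail-page layout 51job served at the time of writing:

# standalone test sketch for singlepage.py -- the URL is a placeholder
if __name__ == '__main__':
    testHeader = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    data = handlePageData("https://jobs.51job.com/wuhan/123456789.html", testHeader)
    for field in data:
        print(field)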

3. Integration (main) module

Request the first page of search results and read the total number of pages from the paging data.

Collect the company posting URLs from every result page, traverse them one by one, extract the data, and finally save everything to Excel.

A few details are worth noting:

(1) All data is written to the workbook in memory and saved only once at the end; with a large amount of data, saving after every row would be far too frequent.

(2) Several sets of request headers are kept in a list, and one is chosen at random for each page request, as a simple way around the site's anti-crawling checks.

(3) The job title to search for is entered manually, URL-encoded with urllib, and spliced into the request address (a small demonstration follows).
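
For detail (3), quote() simply percent-encodes the UTF-8 bytes of the keyword, which is what the search URL expects:

# how the search keyword ends up inside the URL
from urllib.parse import quote
print(quote("前端开发"))  # %E5%89%8D%E7%AB%AF%E5%BC%80%E5%8F%91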

# -*- coding:utf-8 -*-
import os,requests,re
import random
from urllib.parse import quote
from bs4 import BeautifulSoup  # HTML parser (Beautiful Soup, version 4)
import lxml   # parser backend used by bs4
# Excel operation modules
import openpyxl
from openpyxl.styles import Border,Side,PatternFill,Font,Alignment

import singlepage, excelOp

# Several sets of request headers, so one can be picked at random per request to dodge anti-crawling checks
# Chrome, Firefox, Edge
headers = [
    {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive'
    },
    {
        "User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive'
    },
    {
        "User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19041',
        'Accept-Language': 'zh-CN',
        'Connection': 'keep-alive'
    }
]

inputTag = None  # the job title being searched for

def getURL(inputTag, page=1):
    """
    :param inputTag: the search keyword
    :param page: page number
    :return: the assembled search URL
    """
    # Assemble the target address
    weburl = r"https://search.51job.com/list/060000,000000,0000,00,9,99,{tag},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=".format(
        tag=quote(inputTag), page=page)
    return weburl

def getWebHTML(weburl):
    """
    Fetch the HTML of the given page.
    """
    req = requests.get(url=weburl, headers=random.choice(headers))
    req.encoding = req.apparent_encoding  # avoid garbled Chinese text
    webHTML = req.text
    print("页面HTML获取成功!" + weburl)  # log: page HTML fetched
    return webHTML

def getTotalPage(webHTML):
    """
    Build the list of result-page URLs from the paging data embedded in the page.
    """
    pageList = []  # URLs of all result pages (derived from the page count)
    reg = '"total_page":"(.*?)"'
    totalPage = re.findall(reg, webHTML, re.DOTALL | re.I)[0]
    print("获取总的页码数:" + totalPage)  # log: total number of pages
    for i in range(1, int(totalPage) + 1):  # append the URL of every page to pageList
        pageList.append(getURL(inputTag, i))
        # print(getURL(inputTag, i))
    return pageList

def getAllCompanyPage(pageList):
    """
    :param pageList: list of result-page URLs
    :return: the posting URLs of all companies
    """
    companyList = []
    reg = '"job_href":"(.*?)"'
    for pageItem in pageList:
        pageHTML = getWebHTML(pageItem)
        company = re.findall(reg, pageHTML, re.I)
        companyList = companyList + company
    print("搜索出相关公司:" + str(len(companyList)) + "个")  # log: number of companies found
    return companyList

def main():
    """
    Main function.
    """
    global inputTag
    inputTag = input("请输入要爬取的工作岗位:")  # prompt: enter the job title to crawl
    weburl = getURL(inputTag, 1)       # target address, first page by default
    webHTML = getWebHTML(weburl)       # fetch the HTML
    pageList = getTotalPage(webHTML)   # URLs of all result pages
    companyList = getAllCompanyPage(pageList)  # posting URLs of all companies
    # Process each company page, extract the data and save it to an excel file.
    # Initialize the spreadsheet
    excelOp.initExcel(inputTag)
    wb = openpyxl.load_workbook(inputTag + ".xlsx")
    # Process all company pages
    for i, item in enumerate(companyList):
        # Extract the data of one posting page
        # print(item)
        singleInfo = singlepage.handlePageData(item, random.choice(headers))
        # Insert the row into the excel sheet
        excelOp.insertData(wb, singleInfo, i + 2)
        print(singleInfo[0], " 的数据获取成功")  # log: data fetched successfully
    wb.save(inputTag + ".xlsx")
    # Append the number of postings found to the file name
    os.rename(inputTag + ".xlsx", inputTag + "_" + str(len(companyList)) + ".xlsx")

if __name__ == '__main__':
    main()

Summary

The function works: it frees me from the manual operation and makes the job convenient. Personally, though, I feel the code is still fairly rough and needs improvement.

I also ran into a few pitfalls along the way; here are my notes.

(1) Looping over a list with its index

A plain for-in loop only yields the elements of a list. To also get the index, use enumerate():

for i, item in enumerate(companyList):
    print(i, item)    # index, element

(2) Data extraction may fail

When a piece of data cannot be found in the page, a try statement is needed to provide a fallback:

try:
    jobTitle = bs.select(".tHjob h1")[0].get_text().strip()
except Exception:
    jobTitle = "暂无"   # no data found, fall back to "not available"
else:
    pass    # handle the normally extracted data here

There is still a long way to go with Python; keep working hard!

[Screenshot of the resulting Excel file.]

Origin: blog.csdn.net/weixin_42703239/article/details/107976957