# Scrape job listings from zhaopin.com (Zhilian Zhaopin) and store them to text files.
#
# Copyright notice: original article by the author; reproduction requires permission.
# https://blog.csdn.net/california94/article/details/79924202
#-*- coding: utf-8 -*-
import os
import time
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
# Scrape the job categories from zhaopin.com, then crawl each category's
# listings and save them to a file named after the category.

# Pool of User-Agent headers; requests rotate through these to look less bot-like.
hds=[{'User-Agent': 'Mozilla/5.0 (Windows; U;Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}, \
     {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'}, \
     {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}]

position=[] # job-category names collected from the front page
href={}     # maps category name -> its link target (as scraped, presumably relative; confirm against the site)
url="http://sou.zhaopin.com/" # Zhaopin search front page

def search():
    """Fetch the Zhaopin front page (module-level ``url``) and return its HTML.

    Returns:
        The decoded page text, or None if the request failed (the error is
        printed rather than raised).
    """
    try:
        req = urllib.request.Request(url, headers=hds[1])
        html_resource = urllib.request.urlopen(req).read()
        # Ignore undecodable bytes: the page may contain stray non-UTF-8 data.
        return html_resource.decode('utf8', 'ignore')
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        # Bug fix: these exception classes live in urllib.error, not
        # urllib.request — the original except clause would itself raise
        # AttributeError the moment a network error occurred.
        print(e)

def rmline(text):
    """Join the items of *text* with surrounding whitespace stripped.

    Args:
        text: any iterable of strings (e.g. a list of lines). Note that a
            plain string iterates character by character, so ALL whitespace
            is removed in that case.

    Returns:
        A single string: the stripped pieces concatenated (blank pieces
        contribute nothing).
    """
    # Parameter renamed from ``str`` to stop shadowing the builtin, and the
    # quadratic ``+=`` accumulation replaced by a single join; empty stripped
    # pieces are harmless in a join, so no explicit length check is needed.
    return ''.join(piece.strip() for piece in text)

def parse_html(html):
    """Parse the front-page HTML and fill the module-level ``position`` list
    and ``href`` dict with job-category names and their link targets.

    Args:
        html: page text as returned by search(); may be None on network error.
    """
    if not html:
        # search() returns None on failure; the original would crash in
        # BeautifulSoup with a TypeError here.
        return
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find("div", class_="clearfixed")
    if container is None:
        # Page layout changed (or an error page was served); nothing to collect.
        return
    # Iterate only the <a> tags: .children also yields bare text nodes, which
    # have no .get() method and would crash the original loop.
    for link in container.find_all("a"):
        text = link.get_text()
        position.append(text)
        href[text] = link.get('href')

def search_href(href, hds):
    """Fetch *href* using the headers dict *hds* and return the decoded HTML.

    Args:
        href: absolute URL of a category listing page.
        hds: a single headers dict, e.g. one entry of the module-level pool.

    Returns:
        The decoded page text, or None if the request failed (the error is
        printed rather than raised).
    """
    try:
        req = urllib.request.Request(href, headers=hds)
        html_resource = urllib.request.urlopen(req).read()
        # Ignore undecodable bytes: the page may contain stray non-UTF-8 data.
        return html_resource.decode('utf8', 'ignore')
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        # Bug fix: these exception classes live in urllib.error, not
        # urllib.request — referencing them via urllib.request raises
        # AttributeError instead of catching the network error.
        print(e)
def parse_href_html(html, position_name):
    """Parse one category's listing page and write one job per line
    ("title  company  salary  location") to "<position_name>.txt".

    Args:
        html: page text as returned by search_href(); may be None on error.
        position_name: category name, used as the output file name.
    """
    if not html:
        # search_href() returns None on failure; bail out instead of crashing.
        return
    # '/' is illegal in file names on every platform; replace it with a space.
    position_name = position_name.replace("/", ' ')
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all("table", class_="newlist")
    # 'with' guarantees the file is closed even if parsing raises
    # (the original open()/close() pair leaked the handle on any exception).
    with open(position_name + ".txt", 'w', encoding="utf-8") as file:
        # The first "newlist" table is presumably the header row — skipped,
        # as in the original.
        for item in items[1:]:
            zwmc = item.find("td", class_="zwmc").div.a.string  # job title
            gsmc = item.find("td", class_="gsmc").a.string      # company name
            zwyx = item.find("td", class_="zwyx").string        # salary
            gzdd = item.find("td", class_="gzdd").string        # location
            # .string can be None when a cell contains nested markup;
            # substitute '' so the write doesn't raise TypeError.
            fields = (zwmc, gsmc, zwyx, gzdd)
            file.write('  '.join(s or '' for s in fields) + '\n')

if __name__=="__main__":
    parse_html(search())
    i=0;
    for key in href:
        parse_href_html(search_href(url+href[key],hds[i%3]),key)
        i=i+1

# --- Blog-page residue from the copied article (kept for attribution) ---
# 猜你喜欢 ("You may also like")
# Reposted from blog.csdn.net/california94/article/details/79924202