Python爬虫之-智联招聘网站

这个脚本只是爬着玩的,做得不太好;之前爬取前程无忧的那个才是认真做的~

#coding:utf-8
import re
import json
import xlwt
import time
import urllib,urllib2

# Output file shared by the whole script, opened in append mode;
# do() writes one result line per record to it.
h=open('zhaopin.txt','a')

#__author__:小菜菜1223
def run(num):
    """Fetch one page of Zhaopin search results and hand its HTML to get().

    The query string is fixed: ``jl`` is the URL-encoded city name
    (the UTF-8 bytes of "青岛") and ``kw`` is the search keyword ``UI``.

    Args:
        num: Page number; it is appended to the URL as the ``p`` parameter.

    Returns:
        The string 'no' when the response body is empty, otherwise None
        (after the page has been passed to get()).
    """
    # The keyword (kw=UI) can be swapped for .net, java, C%23, database/DBA,
    # front-end, testing, project management, big data, data mining, ios,
    # android, QA, UI.
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%9D%92%E5%B2%9B&kw=UI&sm=0&p=' + str(num)
    response = urllib2.urlopen(url)
    try:
        html = response.read().decode('utf-8')
    finally:
        # Always release the connection, even if reading or decoding fails.
        response.close()
    if not html:
        return 'no'
    get(html)
def get(html):
    """Extract job rows from a search-result page and forward them to do().

    Every regex match is a 4-tuple:
    (text inside the <b> of the job link, remaining text of the job link,
    company path following ``http://company.zhaopin.com/``,
    content of the ``zwyx`` cell).

    Args:
        html: Decoded (unicode) HTML of one search-result page.

    The matches are serialised to a JSON string because do() takes JSON text.
    """
    job_re = '.*?.htm" target="_blank">.*?<b>(.*?)</b>(.*?)</a>.*?'  # job title
    company_re = '<td class="gsmc"><a href="http://company.zhaopin.com/(.*?)" target="_blank">.*? '  # company name
    salary_re = '<td class="zwyx">(.*?)</td>.*?'  # salary

    # Decode so the pattern is unicode like the page text; re.S lets '.'
    # span the newlines between table cells.
    pattern = re.compile((job_re + company_re + salary_re).decode('utf-8'), re.S)
    items = pattern.findall(html)

    do(json.dumps(items, ensure_ascii=False))
def do(li):
    """Write one comma-separated line per (job row, company record) pair.

    Args:
        li: JSON text encoding a list of job rows as produced by get();
            each row is indexed as [0] and [1] for the two parts of the
            job title, [2] for the company path and [3] for the salary.

    For each row the company page is fetched through main(); every company
    record it yields is joined with the job fields, printed, and appended
    to the shared output file ``h``. A row that raises any ordinary error
    is reported with 'pass' and skipped.
    """
    for job in json.loads(li):
        try:
            # Rows whose second title part exceeds 8 characters are never
            # written, so skip them before paying for the company fetch.
            if len(job[1]) > 8:
                continue

            # main() returns JSON text on success; its non-JSON fallback
            # makes json.loads raise, which lands in the handler below.
            companies = json.loads(main(job[2]))

            for company in companies:
                # Two empty columns sit between the job and company fields.
                fields = [job[0], job[1], job[3], '', '',
                          company[0], company[1], company[2],
                          company[3], company[4]]
                line = ','.join(fields)
                print(line)
                h.write(line.encode('utf-8') + '\n')
        except Exception:
            # Best-effort scraping: skip the bad row, but let
            # KeyboardInterrupt / SystemExit stop the crawl.
            print('pass')
            continue
def main(s):
    """Fetch a company page and extract its profile fields as JSON text.

    Args:
        s: Path appended to ``http://company.zhaopin.com/``.

    Returns:
        On success, a JSON string encoding a list of 5-element matches:
        [name taken from <title>, company nature, company size,
        company industry, content of the ``comAddress`` span].
        On any ordinary error, the Python list ['wu', 'wu'].
        NOTE: that fallback is a list, not JSON text, so a caller that
        passes it to json.loads gets a TypeError; the value is kept
        unchanged for compatibility with existing callers.
    """
    title_re = '<title>(.*?)招聘信息_电话_地址-智联招聘</title>.*?'
    profile_re = '<table class="comTinyDes">.*?<span>公司性质:</span>.*?<span>(.*?)</span>.*?<span>公司规模:</span>.*?<span>(.*?)</span>.*?<span>公司行业:</span>.*?<span>(.*?)</span>.*?<span>公司地址:.*?'
    address_re = '<span class="comAddress">(.*?)</span>'
    try:
        # Decode so the pattern is unicode like the page text.
        pattern = re.compile((title_re + profile_re + address_re).decode('utf-8'), re.S)

        response = urllib2.urlopen('http://company.zhaopin.com/' + s)
        try:
            html = response.read().decode('utf-8')
        finally:
            # Always release the connection, even if reading/decoding fails.
            response.close()

        items = pattern.findall(html)
        li = json.dumps(items, ensure_ascii=False)

        # Short pause between company-page requests.
        time.sleep(0.01)
        return li
    except Exception:
        # Narrower than a bare except so Ctrl-C still interrupts the crawl.
        return ['wu', 'wu']
    
# Crawl result pages starting at page 1 until run() reports an empty page.
count = 1
try:
    while True:
        print(count)
        if run(count) == 'no':
            break
        # Brief pause between page requests; originally this sat after the
        # `break` and could never execute.
        time.sleep(0.001)
        count += 1
finally:
    # Flush and release the output file however the crawl ends.
    h.close()
    
    

猜你喜欢

转载自blog.csdn.net/qq_40771567/article/details/80395265