Python爬虫之-前程无忧招聘网站(可运行源码)

今天爬了前程无忧,分享下代码~

可以直接运行的,也很简单,就不做注释了。

原创:禁止转载

#coding:utf-8
import json
import time
import urllib
import urllib2
import re

#__author__='小菜菜1223'
# Shared output file, opened in append mode so repeated runs add to the same
# file. do() writes one comma-separated line per listing to it, and the script
# closes it after the last keyword has been processed.
h = open('qianchengwuyou.txt','a')
def run(num,typ):
    """Fetch one 51job search-result page and pass its HTML to get().

    Args:
        num: value placed in the URL just before ``.html``; the calling
            script passes a counter starting at 1 (presumably the page
            number -- confirm against the site).
        typ: search keyword placed in the URL path. It is inserted as-is,
            so it must already be URL-encoded by the caller.

    Returns:
        Whatever get() returns for the downloaded page.
    """
    # The query string previously contained "cotype=99°reefrom=99": the
    # "&deg" of "&degreefrom" had been turned into a degree sign, which
    # merged two parameters into one. The separator is restored here.
    url = ('https://search.51job.com/list/120300,000000,0000,00,9,99,'
           + str(typ) + ',2,' + str(num) + '.html'
           '?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
           '&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1'
           '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
           '&specialarea=00&from=&welfare=')
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    return get(html)

def get(html):
    """Extract listings from a search-result page and hand them to do().

    Each match yields four captured strings, in this order: the ``title``
    attribute of the first link, the path that follows
    ``https://jobs.51job.com/all/`` in the ``t2`` link, the text of that
    link, and the text of the ``t4`` span.

    Args:
        html: raw HTML of a search-result page.

    Returns:
        'no' when the page contains no listings (the calling script uses
        this to stop paging); otherwise None, after the matches have been
        passed to do().
    """
    head = '<span>.*?<a target="_blank" title="(.*?)" href=".*?" onmousedown="">.*?'
    link = '<span class="t2"><a target="_blank" title=.*? href="https://jobs.51job.com/all/(.*?)">(.*?)</a></span>.*?'
    tail = '<span class="t4">(.*?)</span>'
    # The pattern is plain ASCII, so it is compiled directly; re.S lets
    # ".*?" span the line breaks between the tags.
    pattern = re.compile(head + link + tail, re.S)
    items = pattern.findall(html)
    # findall() returns a list, never a string, so the old comparison with
    # '' was always false and 'no' could never be returned.
    if not items:
        return 'no'
    do(items)
def do(li):
    """Write one comma-separated line per listing to the shared file ``h``.

    For every tuple in ``li`` the stripped fields are emitted in order, each
    followed by a comma. Just before the third field (index 2), the string
    returned by main() for the tuple's second field is inserted.

    Args:
        li: iterable of string tuples, as produced by get().
    """
    for record in li:
        pieces = []
        for position, value in enumerate(record):
            if position == 2:
                # Detail-page data goes in front of the third field.
                pieces.append(main(record[1]))
            pieces.append(value.strip() + ',')
        # writelines() accepts any iterable of strings; given a single
        # string it writes it character by character, i.e. the whole line.
        h.writelines(''.join(pieces) + '\n')
       
def main(s):
    """Download a detail page and return the fields captured from it.

    Args:
        s: path appended to ``https://jobs.51job.com/all/`` to form the
            page URL.

    Returns:
        The stripped, non-empty captured groups of every match, each
        followed by a comma and joined into one string; '' when nothing
        matches.
    """
    response = urllib2.urlopen("https://jobs.51job.com/all/" + s)
    # Short pause between opening the page and reading it
    # (presumably to throttle requests -- confirm).
    time.sleep(0.01)
    page = response.read()
    # NOTE(review): the "|" characters below are not escaped, so the regex
    # engine treats them as alternation rather than literal separators.
    # Each match therefore fills only one of the three groups, which is why
    # empty groups are skipped when the result is assembled.
    # The pattern is plain ASCII, so it is compiled directly.
    pattern = re.compile('<p class="ltype">\r\n          \t\t(.*?)          \t\t          \t\t\t  |  (.*?)          \t\t          \t\t          \t\t\t  |  (.*?)          \t\t          </p>\r\n          <div class="clear"></div>', re.S)
    collected = []
    for groups in pattern.findall(page):
        collected.extend(text.strip() + ',' for text in groups if text)
    return ''.join(collected)
                
    #li= json.dumps(items,ensure_ascii=False)
    #print li
    
       
# Search keywords, inserted into the search URL by run(). Non-ASCII keywords
# and "#" are stored percent-encoded, with the percent signs themselves
# encoded as %25.
lis = ['.net','C%2523','java','.NET','%25E6%2595%25B0%25E6%258D%25AE%25E5%25BA%2593','DBA','%25E5%2589%258D%25E7%25AB%25AF','%25E6%25B5%258B%25E8%25AF%2595','%25E9%25A1%25B9%25E7%259B%25AE%25E7%25AE%25A1%25E7%2590%2586','%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE','%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598','ios','%25E5%25AE%2589%25E5%258D%2593','Android','QA','UI']
count = 1
for i in lis:
    while True:
        # Progress output: current counter and keyword.
        print('%s %s' % (count, i))
        result = run(count, i)
        if result != 'no' and count != 5:
            # More pages to fetch for this keyword.
            count += 1
            continue
        # Either run() reported 'no' or the fifth page was just fetched:
        # reset the counter and move on to the next keyword.
        count = 1
        break

h.close()
   

猜你喜欢

转载自blog.csdn.net/qq_40771567/article/details/80394545