今天爬了前程无忧,分享下代码~
可以直接运行的,也很简单,就不做注释了。
原创:禁止转载
#coding:utf-8
import json
import time
import urllib
import urllib2
import re
#__author__='小菜菜1223'
# Shared output file, opened in append mode so repeated runs add to the
# existing contents. do() writes one line per scraped row through this
# handle, and the driver script at the bottom of the file closes it.
h = open('qianchengwuyou.txt','a')
def run(num, typ):
    """Fetch one page of 51job search results and process it with get().

    Args:
        num: 1-based page number; interpolated into the URL path.
        typ: search keyword, already URL-encoded the way the site expects;
            interpolated into the URL path.

    Returns:
        Whatever get() returns for the downloaded page ('no' is the
        sentinel the driver loop checks to stop paging).
    """
    # The query string previously contained "°reefrom=99": "&deg" had been
    # rendered as the degree sign, which merged the parameter into the
    # preceding "cotype" value. The intended parameter is "&degreefrom=99".
    url = (
        'https://search.51job.com/list/120300,000000,0000,00,9,99,'
        + str(typ) + ',2,' + str(num)
        + '.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
        '&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1'
        '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
        '&specialarea=00&from=&welfare='
    )
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    return get(html)
def get(html):
    """Extract the result rows from a search-results page and pass them to do().

    Each match yields a 4-tuple of captured strings, in this order:
      0. the ``title`` attribute of the first link,
      1. the path that follows ``https://jobs.51job.com/all/`` in the
         ``t2`` link's href,
      2. the text of the ``t2`` link,
      3. the content of the ``t4`` span.

    Args:
        html: page source to search.

    Returns:
        'no' when the page contains no matching rows; otherwise None, after
        the rows have been handed to do().
    """
    reg2 = '<span class="t2"><a target="_blank" title=.*? href="https://jobs.51job.com/all/(.*?)">(.*?)</a></span>.*?'
    reg1 = '<span>.*?<a target="_blank" title="(.*?)" href=".*?" onmousedown="">.*?'
    reg3 = '<span class="t4">(.*?)</span>'
    # The pattern is pure ASCII, so it is compiled directly; re.S lets ".*?"
    # span the line breaks between the three fragments.
    pattern = re.compile(reg1 + reg2 + reg3, re.S)
    items = pattern.findall(html)
    # findall() returns a list, never a string, so the old `items == ''`
    # test could not succeed and the 'no' sentinel was never produced.
    if not items:
        return 'no'
    do(items)
def do(li):
    """Write one comma-separated line to the output file for each row in *li*.

    Every field of a row is stripped and followed by a comma. Just before
    the field at index 2, the text returned by main() for the row's index-1
    field is inserted as-is (it already carries its own trailing comma).

    Args:
        li: iterable of tuples of strings, as produced by get().
    """
    for row in li:
        pieces = []
        for position, field in enumerate(row):
            if position == 2:
                # Detail-page text goes in front of the third field.
                pieces.append(main(row[1]))
            pieces.append(field.strip() + ',')
        h.write(''.join(pieces) + '\n')
def main(s):
    """Download a detail page and return its "ltype" fields as CSV text.

    Args:
        s: path appended to ``https://jobs.51job.com/all/``.

    Returns:
        A string in which every non-empty captured group is stripped and
        followed by a comma; '' when nothing on the page matches.
    """
    url = "https://jobs.51job.com/all/" + s
    response = urllib2.urlopen(url)
    try:
        # Tiny pause between opening the connection and reading the body.
        time.sleep(0.01)
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    # NOTE(review): the "|" characters below are unescaped, so the regex
    # engine treats them as alternation: the pattern is really three
    # alternatives with one group each, and every match tuple has at most
    # one non-empty group (hence the emptiness check further down). The
    # literal looks like it lost markup around the separators when it was
    # pasted -- confirm against the live page before changing it.
    reg = '<p class="ltype">\r\n \t\t(.*?) \t\t \t\t\t | (.*?) \t\t \t\t \t\t\t | (.*?) \t\t </p>\r\n <div class="clear"></div>'
    pattern = re.compile(reg, re.S)
    fields = []
    for groups in pattern.findall(html):
        for value in groups:
            if value:
                fields.append(value.strip() + ',')
    return ''.join(fields)
# Search keywords, already encoded for the search URL path. Non-ASCII
# keywords are percent-encoded twice ("%25" is an encoded "%").
lis = [
    '.net',
    'C%2523',  # C#
    'java',
    '.NET',
    '%25E6%2595%25B0%25E6%258D%25AE%25E5%25BA%2593',  # "database"
    'DBA',
    '%25E5%2589%258D%25E7%25AB%25AF',  # "front end"
    '%25E6%25B5%258B%25E8%25AF%2595',  # "testing"
    '%25E9%25A1%25B9%25E7%259B%25AE%25E7%25AE%25A1%25E7%2590%2586',  # "project management"
    '%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE',  # "big data"
    '%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598',  # "data mining"
    'ios',
    '%25E5%25AE%2589%25E5%258D%2593',  # "Android" (Chinese spelling)
    'Android',
    'QA',
    'UI',
]
# Upper bound on result pages fetched per keyword.
MAX_PAGES = 5
try:
    for keyword in lis:
        for page in range(1, MAX_PAGES + 1):
            # Progress output: "<page> <keyword>".
            print('%s %s' % (page, keyword))
            # run() reports 'no' when a page has no rows; stop paging then.
            if run(page, keyword) == 'no':
                break
finally:
    # Close the output file even when a request fails part-way through.
    h.close()