# Written just for fun, so this scraper is rough; the author's 51job scraper was done more carefully.
#coding:utf-8
import re
import json
import xlwt
import time
import urllib,urllib2
# Output file that do() writes result rows to; opened in append mode, so
# rows from earlier runs are kept.
h=open('zhaopin.txt','a')
#__author__:小菜菜1223
def run(num):
    """Download search-result page ``num`` and hand its HTML to get().

    Returns the string 'no' when the response body is empty; otherwise
    returns None once get() has processed the page.

    NOTE(review): a results page without any listings is still non-empty
    HTML, so 'no' is only returned for a truly empty body -- confirm this
    is the intended stop condition for callers that loop on it.
    """
    # The kw=UI query keyword can be changed to: .net, java, C%23,
    # database/DBA/front-end, testing, project management, big data,
    # data mining, ios, android, QA, UI.
    url='https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%9D%92%E5%B2%9B&kw=UI&sm=0&p='+str(num)
    response = urllib2.urlopen(url)
    try:
        html = response.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()
    if not html:
        return 'no'
    get(html)
def get(html):
    """Extract the job rows from a search-result page and pass them to do().

    Every match yields four captured groups, in this order:
    the text inside the title's <b> tag, the rest of the title text,
    the company path under http://company.zhaopin.com/, and the salary
    cell. The matches are serialized to a JSON string, which is the
    format do() expects.
    """
    # Job title link: group 1 is the <b>...</b> part, group 2 the remainder.
    title_pat = '.*?.htm" target="_blank">.*?<b>(.*?)</b>(.*?)</a>.*?'
    # Company cell: captures the path that follows the company site root.
    company_pat = '<td class="gsmc"><a href="http://company.zhaopin.com/(.*?)" target="_blank">.*? '
    # Salary cell.
    salary_pat = '<td class="zwyx">(.*?)</td>.*?'

    # Decode so the pattern is unicode like the page text it is matched against.
    pattern_text = (title_pat + company_pat + salary_pat).decode('utf-8')
    # re.S lets '.' span newlines, since one row covers several HTML lines.
    row_re = re.compile(pattern_text, re.S)
    items = row_re.findall(html)
    li = json.dumps(items, ensure_ascii=False)
    do(li)
def do(li):
    """Write one CSV-style line per (job row, company match) to the output file.

    ``li`` is a JSON string holding a list of job rows as produced by get():
    [title part 1, title part 2, company path, salary]. For each row the
    company page is fetched through main(), and every match it returns is
    appended to the row. Rows whose second title part is longer than 8
    characters are skipped. A row that fails for any reason prints 'pass'
    and processing continues with the next row.
    """
    rows = json.loads(li)
    for row in rows:
        try:
            # Filter before calling main() so discarded rows do not cost
            # a network request.
            if len(row[1]) > 8:
                continue
            # main() returns a non-JSON list on failure; json.loads then
            # raises and the row is reported as 'pass' below.
            companies = json.loads(main(row[2]))
            for company in companies:
                # The two empty fields keep the original column layout.
                line = ','.join([
                    row[0], row[1], row[3], '', '',
                    company[0], company[1], company[2], company[3], company[4],
                ])
                # Write before echoing so that a console encoding error
                # raised by print cannot drop the record.
                h.write(line.encode('utf-8') + '\n')
                print(line)
        except Exception:
            # Best effort per row, but let KeyboardInterrupt/SystemExit through.
            print('pass')
            continue
def main(s):
    """Fetch a company page and return its extracted fields as a JSON string.

    ``s`` is the path appended to ``http://company.zhaopin.com/``.

    On success, returns the JSON text of a list of matches, each with five
    captured groups in this order: the name taken from <title>, company
    type, company size, company industry, and the comAddress span.

    On any failure, returns the list ['wu', 'wu'] instead. This is a list,
    not JSON text, so callers that json.loads() the result will raise on it.
    """
    try:
        url = 'http://company.zhaopin.com/' + s
        response = urllib2.urlopen(url)
        try:
            html = response.read().decode('utf-8')
        finally:
            # Release the connection even when reading or decoding fails.
            response.close()
        reg0='<title>(.*?)招聘信息_电话_地址-智联招聘</title>.*?'
        reg1='<span class="comAddress">(.*?)</span>'
        reg2='<table class="comTinyDes">.*?<span>公司性质:</span>.*?<span>(.*?)</span>.*?<span>公司规模:</span>.*?<span>(.*?)</span>.*?<span>公司行业:</span>.*?<span>(.*?)</span>.*?<span>公司地址:.*?'
        # The literals are UTF-8 bytes; decode them so the pattern is
        # unicode like the page text.
        pattern_text = (reg0 + reg2 + reg1).decode('utf-8')
        # re.S lets '.' span newlines between the page sections.
        items = re.compile(pattern_text, re.S).findall(html)
        li = json.dumps(items, ensure_ascii=False)
        # Short pause between company-page requests.
        time.sleep(0.01)
        return li
    except Exception:
        # Best effort: report failure with the sentinel, but let
        # KeyboardInterrupt/SystemExit propagate.
        return ['wu', 'wu']
# Walk the result pages in order, starting at page 1, until run() answers 'no'.
count = 1
while True:
    # Progress output: the page number about to be fetched.
    print(count)
    result = run(count)
    if result == 'no':
        break
    # Brief pause between page requests.
    time.sleep(0.001)
    count += 1