爬虫目的:爬取某直聘的岗位信息,如果是新岗位,就发送邮件至指定邮箱。
重点:
- selenium的使用以及如何避免被检测。
- pyquery解析数据的规则。
import time,random
import redis
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from pyquery import PyQuery as pq
from mylibrary import send_mail
# Redis database connection shared by the crawler.
rs_conn = redis.Redis(
    host="localhost",
    port='6379',
    db=5,
    password='888888',
    decode_responses=True,
)
# Optional Chrome switches, currently disabled and kept for reference:
# options.add_argument("--proxy-server=http:/115.218.0.233:9000")
# options.add_argument("--disable-blink-features=AutomationControlled")  # tells Chrome to drop the webdriver trace
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# options.add_experimental_option('useAutomationExtension', False)
def validate_ip():
    """Open a visible Chrome window so a human can clear an anti-crawler check.

    Loads one search-result page in a browser started without any options.
    If the page source contains the marker text '异常', the function waits
    10 seconds so the verification images can be clicked by hand. The browser
    is always closed before returning, even when loading the page fails.
    """
    driver = webdriver.Chrome()
    try:
        driver.set_window_size(800, 500)
        time.sleep(1)
        url = 'https://www.zhipin.com/c101210100/?query=爬虫&page=1'
        driver.get(url)
        time.sleep(1)  # give the page a moment to render before inspecting it
        if '异常' in driver.page_source:  # this notice means manual verification is needed
            time.sleep(10)
    finally:
        # Without this, an exception above would leave a Chrome process running.
        driver.quit()
def _render_job_row(company, position, salary, joblink):
    """Return one HTML table row for a job, with every scraped value escaped."""
    import html  # local import: the file's import block does not provide it

    def cell(value):
        # A missing attribute comes back as None; render it as an empty cell.
        return html.escape('' if value is None else str(value), quote=True)

    link = cell(joblink)
    return '<tr><td>%s</td><td>%s</td><td>%s</td><td><a href="%s">%s</a></td></tr>' % (
        cell(company), cell(position), cell(salary), link, link)


def crawl_zhipin():
    """Crawl the first two result pages and collect jobs not seen before.

    Each job id is added to the Redis set ``bosszhipin_id``; a job counts as
    new only when that insert succeeds. Job cards without an id or a link are
    skipped because they can be neither de-duplicated nor linked to.

    :return: the HTML ``<tr>`` rows of all new positions concatenated into one
        string, or an empty string when no new position was found.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"')
    options.add_argument('--headless')
    # proxy='http://111.3.118.247:30001'
    # options.add_argument("--proxy-server=%s" % proxy)
    # `options=` replaces the deprecated `chrome_options=` keyword, which
    # current Selenium releases no longer accept.
    driver = webdriver.Chrome(options=options)
    rows = []
    kw = '爬虫'
    try:
        for page in range(1, 3):
            url = f'https://www.zhipin.com/c101210100/?query={kw}&page={page}&ka=page-{page}'
            print(f'crawling page {page}:', url)
            driver.get(url)
            time.sleep(1)
            page_html = driver.page_source
            for item in pq(page_html)('.job-primary').items():
                company = item('.name a').attr('title')
                salary = item('.red ').text()
                anchor = item('.job-name a')
                position = anchor.attr('title')
                href = anchor.attr('href')
                jobid = anchor.attr('data-jobid')
                if not jobid or not href:
                    # No id: the job cannot be de-duplicated. No href: the
                    # string concatenation below would raise TypeError.
                    continue
                joblink = 'https://www.zhipin.com' + href
                print('{', company, position, salary, joblink, "}")
                if rs_conn.sadd("bosszhipin_id", jobid):  # a successful insert means a new position
                    rows.append(_render_job_row(company, position, salary, joblink))
            # Random sub-second pause before requesting the next page.
            interval = random.random()
            print(interval)
            time.sleep(interval)
    finally:
        # Always release the browser, even if a page load or Redis call fails.
        driver.quit()
    print('new job qty:', len(rows))
    return ''.join(rows)
if __name__ == '__main__':
    # Run the visible-browser check first, then the headless crawl.
    validate_ip()
    message = crawl_zhipin()
    if message != '':
        # Wrap the collected rows in a complete HTML table before mailing them.
        page_head = '<html><head><title>岗位新闻</title></head><body><table border=1 cellspacing=1>'
        column_titles = '<tr><td>公司名称</td><td>岗位名称</td><td>薪资</td><td>链接地址</td></tr>'
        page_tail = '</table></body></html>'
        message = ''.join((page_head, column_titles, message, page_tail))
        send_mail('BOSS直聘‘爬虫’岗位', message)  # send the e-mail