import requests
from lxml import etree
from day5 import helper
# Prefix shared by every XPath that reads the detail page's information table.
_DETAIL_TABLE_XPATH = '//div[@ id ="position_detail"]/div[1]/table'


def _fetch_html(url, headers):
    """Download ``url`` and return the response body parsed as an lxml HTML tree."""
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    return etree.HTML(response.text)


def _join_li_text(li_elements):
    """Concatenate the ``.text`` of each element, skipping elements whose text is None."""
    return ''.join(li.text for li in li_elements if li.text is not None)


def _parse_detail(detail_tree):
    """Extract one posting from a parsed detail page.

    Returns the tuple ``(didian, leixing, renshu, position, duty, requirement)``
    in the column order of the insert statement, or ``None`` when the page does
    not contain the table cells this scraper expects.
    """
    # Row 2 carries three text nodes: location, job type, head count.
    summary = detail_tree.xpath(_DETAIL_TABLE_XPATH + '/tr[2]/td/text()')
    # Row 1 carries the position title.
    title_cells = detail_tree.xpath(_DETAIL_TABLE_XPATH + '/tr[1]/td')
    if len(summary) < 3 or not title_cells:
        return None

    didian, leixing, renshu = summary[:3]
    position = title_cells[0].text
    # Row 3 lists the duties, row 4 the requirements; some <li> items have no
    # direct text, so those are dropped before joining.
    duty_type = _join_li_text(
        detail_tree.xpath(_DETAIL_TABLE_XPATH + '/tr[3]/td/ul/li'))
    requirement_type = _join_li_text(
        detail_tree.xpath(_DETAIL_TABLE_XPATH + '/tr[4]/td/ul/li'))
    return (didian, leixing, renshu, position, duty_type, requirement_type)


def tx_zhaoping(num):
    """Scrape python job postings from hr.tencent.com and store them in MySQL.

    Walks the listing pages, follows every posting link found on each page,
    prints the position title, duties and requirements, and inserts one row per
    posting into the ``tx_zhaoping`` table.

    Args:
        num: Exclusive upper bound for the listing URL's ``start`` offset. The
            script entry point passes ``10 * page_count``, so offsets advance
            in steps of 10 (0, 10, 20, ...) to visit each listing page once.

    Returns:
        None.
    """
    if num <= 0:
        # Nothing to crawl; avoid opening a database connection for no work.
        return

    headers = {
        'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    x_url = 'https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a'
    sql = 'insert into tx_zhaoping(didian,leixing,renshu,position,duty,requirement)values (%s,%s,%s,%s,%s,%s)'

    # One database helper for the whole crawl instead of one per listing page.
    myhelper = helper.MysqlHelper()

    for start in range(0, num, 10):
        page_tree = _fetch_html(x_url.format(start), headers)
        # Relative links to the detail page of every posting on this listing page.
        hrefs = page_tree.xpath('//div[@id="position"]/div[1]/table/tr/td/a/@href')

        for href in hrefs:
            next_url = 'https://hr.tencent.com/' + href
            data = _parse_detail(_fetch_html(next_url, headers))
            if data is None:
                # One malformed page should not abort the remaining postings.
                print('skipped {}: unexpected page layout'.format(next_url))
                continue

            _, _, _, position, duty_type, requirement_type = data
            print(position)
            print(duty_type)
            print(requirement_type)
            myhelper.execute_modify_sql(sql, data)
if __name__ == '__main__':
    # Script entry point: read the number of listing pages to crawl and scale
    # it by 10 before handing it to tx_zhaoping as the `start` offset bound.
    num = 10 * int(input('请输入你要爬取的招聘信息页数'))
    tx_zhaoping(num)
# The code below is the database helper module (the `helper` imported above), which defines MysqlHelper.
import pymysql
class MysqlHelper(object):
    """Small wrapper around one PyMySQL connection and cursor for write statements.

    The connection is opened on construction and released by ``close()``, by
    leaving a ``with`` block, or as a last resort when the object is finalized.
    """

    # NOTE(review): the default credentials are hard-coded; pass explicit
    # values (e.g. read from configuration) outside local development.
    def __init__(self, host='localhost', port=3306, user='root',
                 password='123456', database='lianjia', charset='utf8'):
        """Open the connection and create a cursor.

        All parameters are forwarded to ``pymysql.connect``; the defaults are
        the values this helper originally hard-coded.
        """
        # Pre-set both attributes so close()/__del__ stay safe if connect() raises.
        self.db = None
        self.cursor = None
        self.db = pymysql.connect(host=host, port=port, user=user,
                                  password=password, database=database,
                                  charset=charset)
        self.cursor = self.db.cursor()

    def execute_modify_sql(self, sql, data):
        """Execute one parameterized write statement and commit it.

        Args:
            sql: Statement text with ``%s`` placeholders.
            data: Parameter values bound to the placeholders.

        Raises:
            Whatever the cursor or connection raises; the transaction is
            rolled back before the exception propagates.
        """
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied transaction open on the connection.
            self.db.rollback()
            raise

    def close(self):
        """Close the cursor and the connection; safe to call more than once."""
        cursor = getattr(self, 'cursor', None)
        db = getattr(self, 'db', None)
        # Drop the references first so a repeated call becomes a no-op.
        self.cursor = None
        self.db = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if db is not None:
                db.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __del__(self):
        # Finalizers must not raise; the instance may be only partly built.
        try:
            self.close()
        except Exception:
            pass
# if __name__ == '__main__':
# conn = MysqlHelper()
# conn.execute_modify_sql('insert into lianjiaxinxi(title) VALUE (%s)', data=('huzeqi hehehe'))