智联招聘网的工作岗位爬取
一:页数的处理
def zhi_page(url, startpage, endpage):
    """Build the search-result URL for one page of Zhilian listings.

    The original loop rebound ``page`` on every iteration and only one URL
    survived, so all pages but one were silently discarded (and an empty
    range raised NameError).  The single-string return type is kept for
    backward compatibility: this returns the URL of page ``endpage - 1``,
    the value the original loop left behind.  Callers that want every page
    should call this once per page.

    :param url: base search URL ending in ``...&p=`` (page number appended).
    :param startpage: first page number (inclusive).
    :param endpage: one past the last page number (exclusive, range-style).
    :returns: ``url`` with the last page number of the range appended.
    :raises ValueError: if the page range is empty.
    """
    if startpage >= endpage:
        raise ValueError("empty page range: startpage must be < endpage")
    return url + str(endpage - 1)
二:响应数据的获取
import urllib.request
import urllib.parse

from lxml import etree


def zhi_url(page):
    """Fetch *page* over HTTP with a desktop-browser User-Agent header.

    Returns the open response object; the caller is responsible for
    reading (and closing) it.
    """
    ua = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36")
    request = urllib.request.Request(url=page, headers={"User-Agent": ua})
    return urllib.request.urlopen(request)
三:网页上想要数据的匹配
def zhi_spider(res):
    """Parse one Zhilian search-result page into a list of job records.

    Fixes the original bug where a single dict was reused and overwritten
    on every loop iteration, so only the last job's fields survived; a
    fresh dict is now built per result table.

    :param res: an open response object (anything with ``.read()``
        returning the HTML bytes).
    :returns: list of dicts, one per job, keyed 职位类型 / 公司名称 /
        待遇 / 工作地点 / 发布时间 (each value is the raw xpath text list).
    """
    html = res.read()
    tree = etree.HTML(html)
    zhi_list = tree.xpath('//div[@id="newlist_list_content_table"]/table')
    zhis = []
    for zhi in zhi_list:
        zhis.append({
            '职位类型': zhi.xpath('.//div/a/text()'),
            '公司名称': zhi.xpath('.//td[@class="gsmc"]//text()'),
            '待遇': zhi.xpath('.//td[@class="zwyx"]/text()'),
            '工作地点': zhi.xpath('.//td[@class="gzdd"]/text()'),
            # NOTE(review): identical xpath to 工作地点 — the publish-date
            # cell almost certainly has its own class; verify against the
            # live page markup before trusting this field.
            '发布时间': zhi.xpath('.//td[@class="gzdd"]/text()'),
        })
    return zhis
四:主函数
def main():
    """Interactive entry point: prompt for a city, a job keyword and a
    page range, crawl every page in ``[startpage, endpage)`` and return
    the collected per-page results.

    Fixes two defects in the original: the whole page range was passed
    through ``zhi_page`` once (so only a single page was ever fetched),
    and the job keyword was interpolated into the URL without
    percent-encoding (Chinese keywords produced an invalid URL).
    """
    dd = input('输入工作的地点')
    dd = urllib.parse.quote(dd)
    work = input('输入工作的类型')
    work = urllib.parse.quote(work)  # keyword may be Chinese too
    startpage = int(input('输入起始页'))
    endpage = int(input('输入结束页'))
    url = ("http://sou.zhaopin.com/jobs/searchresult.ashx?jl=" + dd +
           "&kw=" + work + "&sm=0&p=")
    # Crawl each page individually and aggregate, instead of collapsing
    # the whole range into one URL.
    results = []
    for p in range(startpage, endpage):
        page_url = zhi_page(url, p, p + 1)
        results.append(zhi_spider(zhi_url(page_url)))
    return results


if __name__ == '__main__':
    main()
整体代码的样式
"""Scrape job listings from Zhilian (sou.zhaopin.com) search results.

Fixed version of the tutorial listing: one record dict per job (the
original reused and overwrote a single dict), every page in the requested
range is fetched (the original fetched only one), and the job keyword is
percent-encoded alongside the city.
"""
import urllib.parse
import urllib.request


def zhi_page(url, startpage, endpage):
    """Return the search URL for the last page in ``[startpage, endpage)``.

    Single-string return kept for backward compatibility with the
    original; callers wanting all pages call this once per page.

    :raises ValueError: if the page range is empty.
    """
    if startpage >= endpage:
        raise ValueError("empty page range: startpage must be < endpage")
    return url + str(endpage - 1)


def zhi_url(page):
    """Fetch *page* with a desktop-browser User-Agent and return the
    open response object (caller reads it)."""
    ua = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36")
    request = urllib.request.Request(url=page, headers={"User-Agent": ua})
    return urllib.request.urlopen(request)


def zhi_spider(res):
    """Parse one search-result page into a list of job dicts.

    :param res: an open response object (anything with ``.read()``).
    :returns: list of dicts keyed 职位类型 / 公司名称 / 待遇 / 工作地点 /
        发布时间, one dict per result table.
    """
    # Lazy import: lets the rest of the module work without lxml installed.
    from lxml import etree

    html = res.read()
    tree = etree.HTML(html)
    zhi_list = tree.xpath('//div[@id="newlist_list_content_table"]/table')
    zhis = []
    for zhi in zhi_list:
        zhis.append({
            '职位类型': zhi.xpath('.//div/a/text()'),
            '公司名称': zhi.xpath('.//td[@class="gsmc"]//text()'),
            '待遇': zhi.xpath('.//td[@class="zwyx"]/text()'),
            '工作地点': zhi.xpath('.//td[@class="gzdd"]/text()'),
            # NOTE(review): identical xpath to 工作地点 — the publish-date
            # cell likely has its own class; verify against the live page.
            '发布时间': zhi.xpath('.//td[@class="gzdd"]/text()'),
        })
    return zhis


def main():
    """Prompt for city, keyword and page range; crawl every page in
    ``[startpage, endpage)`` and return the collected results."""
    dd = input('输入工作的地点')
    dd = urllib.parse.quote(dd)
    work = input('输入工作的类型')
    work = urllib.parse.quote(work)  # keyword may be Chinese too
    startpage = int(input('输入起始页'))
    endpage = int(input('输入结束页'))
    url = ("http://sou.zhaopin.com/jobs/searchresult.ashx?jl=" + dd +
           "&kw=" + work + "&sm=0&p=")
    results = []
    for p in range(startpage, endpage):
        page_url = zhi_page(url, p, p + 1)
        results.append(zhi_spider(zhi_url(page_url)))
    return results


if __name__ == '__main__':
    main()