A Complete Example: Scraping Real-Time Job Postings from the Jiaxing Talent Network and Writing Them to a TXT File

Last time we covered scraping the real-time recruitment section of the Jiaxing Talent Network and printing the title links on each page (see the previous post). This time we pick up where we left off and walk through the complete crawler code.

Disclaimer (again): this code is for technical learning and exchange only, not for any other purpose.

Approach:
Continuing from the previous article, the next step is to iterate over every URL in the list (each one points to an individual job posting), join each relative URL into a complete link, request each posting page, extract the title, publication date, and job requirements with XPath, and append them to a text file. That's all there is to it. A rough sketch of this step:

# Build the full link for each posting, fetch it, and save the fields.
# Assumes requests, lxml.html, re, and time are imported, and that
# headers and list_url come from the previous article's code.
for i in list_url:
    newlist_url = "https://www.jxrsrc.com/Index/" + i
    r2 = requests.get(newlist_url, headers=headers)
    tree = html.fromstring(r2.text)
    title = tree.xpath("//span[@id='lbTitle']/text()")
    date = tree.xpath("//span[@id='lbDate']/text()")
    content = tree.xpath("//div[@id='lblcontent' and @class='about']")[0]
    # Flatten the node to plain text and strip all whitespace
    info = re.sub(r'\s+', "", content.xpath('string(.)'))
    with open("D:\\测试.txt", "a", encoding='utf-8') as f:
        f.write(title[0] + "\n")
        f.write(date[0] + "\n")
        f.write(info + "\n")
        # Record when this posting was crawled
        ct = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        f.write("Crawl time: " + ct + "\n")
        f.write("\n------------------------------------------------------------------------\n")
    print("Saving........")
    time.sleep(1)  # pause between requests
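As an aside, the plain string concatenation above only works because every href on this site happens to be relative to /Index/. urllib.parse.urljoin is a more general way to build the full link; a hedged sketch of the alternative (not part of the original code):

from urllib.parse import urljoin

base = "https://www.jxrsrc.com/Index/"
# urljoin resolves relative hrefs against the base and also copes
# with absolute or slash-prefixed hrefs, which "+" would mangle
newlist_url = urljoin(base, i)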

What the text file looks like after the content is written:
[Screenshots of the resulting TXT file]
As you can see, I deliberately included the crawl time in the content, so that postings are not crawled repeatedly:

ct = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
f.write("Crawl time: " + ct + "\n")

The complete, tested code:

import requests
import random
from lxml import html
import time
import re

# Endpoint that serves the paginated job-listing data
url = "https://www.jxrsrc.com/Index/Ashx/MoreInfo2.ashx"

# Session cookie captured from the browser (replace with your own)
cookie = "ASP.NET_SessionId=r2rwx4rzl4xu11e3s5131qjn; ASPSESSIONIDSGCDABQQ=GHDEIGBAJIGFIFHNICDEPCGF; Hm_lvt_8779a80c84018cd39c87c4dd911d90ba=1603981533,1604155293,1604417018,1604417059; Hm_lpvt_8779a80c84018cd39c87c4dd911d90ba=1604417066"
# Pool of User-Agent strings to mimic different browsers
ugList = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15"
]
# Shuffle the User-Agent pool
random.shuffle(ugList)

# Build the browser request headers
headers = {
    "User-Agent": random.choice(ugList),
    "Host": "www.jxrsrc.com",
    "Connection": "keep-alive",
    "Accept": "*/*",
    "Referer": "https://www.jxrsrc.com/Index/MoreInfo.aspx?TypeID=34",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": cookie,  # send the session cookie captured above
}

try:
    for page_num in range(1, 3):  # the range controls how many pages are crawled
        time.sleep(1)  # pause between page requests
        # Form data the endpoint expects for each page
        formdata = {"PageIndex": page_num, "KeyWord": "", "typeID": 34}
        # Local proxy; remove or replace it with your own
        proxies = {"http": "192.168.1.5:49407"}
        r = requests.post(url, proxies=proxies, data=formdata, headers=headers)

        preview_html = html.fromstring(r.content.decode("utf-8"))

        # XPath: collect the relative link of every posting on this page
        list_url = preview_html.xpath("//dl//dt//a//@href")
        print("--------------------------------------------Page " + str(page_num) + "-----------------------------------------")
        # The code below is the new part, continuing from the previous article
        for i in list_url:
            # Build the full link for each posting
            newlist_url = "https://www.jxrsrc.com/Index/" + i
            r2 = requests.get(newlist_url, headers=headers)
            tree = html.fromstring(r2.text)
            title = tree.xpath("//span[@id='lbTitle']/text()")
            date = tree.xpath("//span[@id='lbDate']/text()")
            content = tree.xpath("//div[@id='lblcontent' and @class='about']")[0]
            # Flatten the node to plain text and strip all whitespace
            info = re.sub(r'\s+', "", content.xpath('string(.)'))
            with open("D:\\测试.txt", "a", encoding='utf-8') as f:
                f.write(title[0] + "\n")
                f.write(date[0] + "\n")
                f.write(info + "\n")
                # Record when this posting was crawled
                ct = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                f.write("Crawl time: " + ct + "\n")
                f.write("\n------------------------------------------------------------------------\n")
            print("Saving........")
            time.sleep(1)  # pause between posting requests
        print("保存完毕........")

except Exception as e:
    print(e)
    time.sleep(1)
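One fragile spot worth noting: title[0] and date[0] raise IndexError when a posting page is missing the expected span ids (for instance, a deleted posting). A small guard you could add inside the inner loop, right after the two xpath calls (a sketch, not part of the tested code above):

            # Skip postings whose page doesn't match the expected layout
            if not title or not date:
                print("Unexpected page layout, skipping: " + newlist_url)
                continue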


Reposted from blog.csdn.net/weixin_51424938/article/details/112689498