Days of Learning Python: Web Scraping (4)

Hands-on example: getting the product IDs from the first page of Tmall

import requests
from lxml import etree

headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "max-age=0",
    "cookie": "hng=CN%7Czh-CN%7CCNY%7C156; cna=ec6fE0YtC1ACAXlFUabPKpCR; _m_h5_tk=ae0dd1f3d3714b382a1cf17af8214fca_1530877233892; _m_h5_tk_enc=4e6ac4e9cdbfa3d176b3631f68908573; lid=%E8%A5%BF%E8%A5%BF0227; _med=dw:1366&dh:768&pw:1366&ph:768&ist:0; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; enc=QKkzqQkEjKpB%2FSQo6gohk5JuHIQJIADh5Z%2BAh%2FPMYsBd2EpevHd1Pjx9GzFo9DTKdGncP5AEDaIqrD46ftYJEQ%3D%3D; t=f6190d2ecfa923f4e501c8c27aa7e729; uc3=vt3=F8dBzr%2FH0F%2BI0bvn6NA%3D&id2=UUpnjd%2BYSqP9Hw%3D%3D&nk2=rW6vAL48MIc%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D; tracknick=%5Cu897F%5Cu897F0227; lgc=%5Cu897F%5Cu897F0227; _tb_token_=73f3eefb5eb1e; cookie2=13466b0dba8d366e8186c5fc759a66de; tt=tmall-main; cq=ccp%3D1; swfstore=125004; res=scroll%3A1347*5924-client%3A1347*229-offset%3A1347*5924-screen%3A1366*768; pnm_cku822=098%23E1hvOQvUvbpvUvCkvvvvvjiPPss9gji2n2zWljYHPmPysjrbRLMhzjE8Rs59ljECiQhvCvvv9UUEvpCWm2Pzvvakfw1lYb8rwAT%2BkUcEhBOiHWLp6W97RAYVyO2vqbVQWl4vzRFE%2BFIlBqevD70Xderv%2B8c6sEuOwHsXS47BhC3qVUcnDOvwjOyCvvOUvvVvayTivpvUvvmvWk%2Fmq8TtvpvIvvvvvhCvvvvvvUUuphvvDQvv9krvpvQvvvmm86CvmVWvvUUdphvUOQhCvvOvUvvvphvPvpvhvv2MMsyCvvpvvvvv; isg=BC0t_ktfCrMy1e59MebiYaFzPMlnIn13kBESGm8yaUQz5k2YN9pxLHvc1Pql5nkU",
    "referer": "https://www.tmall.com/",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36",
}
url = 'https://list.tmall.com/search_product.htm?q=%BF%C2%C4%CF&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton'
# The Tmall list page is GBK-encoded, so decode the raw bytes explicitly.
response = requests.get(url, headers=headers)
html = etree.HTML(response.content.decode("GBK"))
# Each product container on the list page carries its numeric ID in a data-id attribute.
list_id = html.xpath('//div/@data-id')
print(list_id)
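
Once the IDs are collected, each one can be turned into a product detail URL. A minimal follow-up sketch, continuing from list_id above and assuming the common detail.tmall.com/item.htm?id= address pattern (an assumption, not something this page confirms):

# Hypothetical follow-up: turn the scraped data-id values into detail-page URLs.
# The item.htm?id= pattern is an assumption about how Tmall addresses products.
for pid in list_id:
    print("https://detail.tmall.com/item.htm?id={}".format(pid))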

Hands-on example: getting the product IDs from the first page of JD

from urllib3 import PoolManager, disable_warnings
from bs4 import BeautifulSoup

disable_warnings()

def get_product_id():
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "__jdu=724477672; ipLoc-djd=1-72-2799-0; __jdv=122270672|baidu|-|organic|%25E4%25BA%25AC%25E4%25B8%259C|1530601873999; PCSYCityID=1; pinId=9I_Ff6ShTW8Iz8_pliZq3LV9-x-f3wj7; _tp=gOnyLLvJgMP8NGyEuqnusj5FFM%2Bcyy35ASysL6SW0mI%3D; _pst=jd_58d7ad672485e; pin=jd_58d7ad672485e; user-key=8bbbcecf-5481-4e15-80a1-c740f09dc772; cn=0; unick=XHuskie; TrackID=1jlvQ9rI6HtMiRRDluwXPNpHZIWrpoYptVIsuFror-P3aYfi2kGHtJx4nt4tD-GoqP_Yf9cPPr8hBt2yoFBgFXY3lfoPt6f7WG-9d3eYzqH0; xtest=4695.cf6b6759; mt_xid=V2_52007VwURV1heVF0ZSilcV2dWQFBZXk5SSExKQABjABNODQ1UDgNNHFxRZgURBlVeUVsvShhcDHsCFE5cX0NaHEIZVQ5kASJQbVhiUxlOGlQMbwAWVFteW1wfTxxeA1cDF1Ra; ipLocation=%u5317%u4EAC; areaId=1; __jda=122270672.724477672.1528337977.1530850682.1530877890.7; __jdc=122270672; rkv=V0900; qrsc=3; shshshfpa=9c89cef2-fade-6267-4f1f-a38a0619f0e9-1530878711; shshshfpb=1383b6cc8dc8b44b480c3bcd3d8268a3da61eee41c5449c095af58fe69; __jdb=122270672.6.724477672|7.1530877890; shshshfp=0ace2df06cd4f9d1ae9505bee7eb524e; shshshsID=12674382e7c342c95a5a8768535f2236_2_1530878717175; 3AB9D23F7A4B3C9B=BBPPFTMWLLBWMKYUPMVNQKOGEDNIUF67TOEUOMRZJGQFCI6E765GUNCYBFECNSF5TWZEWDPOTPA6KY3LYJHYEDOQNQ",
        "Host": "search.jd.com",
        "Referer": "https://www.jd.com/",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36",
    }
    url = "https://search.jd.com/Search?keyword=%E6%9F%AF%E5%8D%97&enc=utf-8&wq=%E6%9F%AF%E5%8D%97&pvid=ab06bca10d0a43cd8d598f03c92f92d3"
    http = PoolManager()
    response = http.request("GET", url, headers=headers)
    html = response.data.decode("utf-8")
    soup = BeautifulSoup(html, "lxml")
    # Every product thumbnail sits in <div class="p-img"><a href="//item.jd.com/<id>.html">.
    all_link = soup.select('div[class="p-img"] a')
    links = []
    for link in all_link:
        href = link["href"]
        if "//item.jd.com/" in href:
            links.append(href)
    # Deduplicate, then strip each link down to its bare numeric ID.
    id_list = []
    for link in set(links):
        product_id = link.replace("//item.jd.com/", "").replace(".html", "").replace("#comment", "")
        id_list.append(product_id)
    return list(set(id_list))

if __name__ == "__main__":
    list_id = get_product_id()
    print(list_id)
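
The same IDs can also be pulled straight from the raw HTML with a regular expression instead of walking the DOM. A minimal sketch, assuming every product link keeps the //item.jd.com/<digits>.html shape filtered for above:

import re

def extract_ids_with_regex(html):
    # Capture the numeric ID from links like //item.jd.com/123456.html;
    # the pattern is an assumption based on the hrefs matched above.
    return list(set(re.findall(r"//item\.jd\.com/(\d+)\.html", html)))

Fed the decoded html string from inside get_product_id(), this should return the same set of IDs, minus any links that deviate from that shape.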

Scraping Tencent's social-recruitment (experienced-hire) job listings

import json
import requests
from bs4 import BeautifulSoup

def tencent():
    url = 'https://hr.tencent.com/position.php?&start=10'
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "pgv_pvi=683008000; PHPSESSID=km4rkmampeugf0seuntmdib963; pgv_si=s5624891392",
        "Host": "hr.tencent.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36",
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    # Job rows alternate between the "odd" and "even" CSS classes.
    all_tag = soup.select('tr[class="odd"]') + soup.select('tr[class="even"]')
    list_position = []
    for tr in all_tag:
        tds = tr.select("td")
        item = {
            "name": tr.td.a.string,
            "data_link": tr.td.a["href"],
            "job_category": tds[1].string,
            "recruit_number": tds[2].string,
            "address": tds[3].string,
            "publish_time": tds[4].string,
        }
        list_position.append(item)
    # Write the whole list once, after the loop, as real JSON.
    with open("json.txt", "w", encoding="utf-8") as f:
        json.dump(list_position, f, ensure_ascii=False)

if __name__ == '__main__':
    tencent()
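
The start=10 in the URL reads like a row offset, ten positions per page. A minimal sketch of walking several result pages under that assumption (the page size is inferred from the URL, not from any documentation), reusing the same headers as above:

import requests
from bs4 import BeautifulSoup

def tencent_pages(num_pages=5, headers=None):
    # Assumes ten rows per page, addressed through the start offset.
    rows = []
    for page in range(num_pages):
        url = "https://hr.tencent.com/position.php?&start={}".format(page * 10)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "lxml")
        rows += soup.select('tr[class="odd"]') + soup.select('tr[class="even"]')
    return rows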


Fetching all the city names from Lagou and saving them

import requests
import jsonpath
import json

url = "https://www.lagou.com/lbs/getAllCitySearchLabels.json"
response = requests.get(url)
data = json.loads(response.text)
# "$..name" recursively collects every "name" field anywhere in the response.
name_city = jsonpath.jsonpath(data, "$..name")
# A context manager closes the file; ensure_ascii=False keeps the Chinese readable.
with open("city.json", "w", encoding="utf-8") as f:
    json.dump(name_city, f, ensure_ascii=False)
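
The "$..name" expression does the heavy lifting: $ is the document root and .. is recursive descent, so every "name" key at any depth ends up in one flat list. A toy demonstration on a made-up structure (the real Lagou payload nests differently):

import jsonpath

# Made-up nesting for illustration only.
data = {"data": {"A": [{"name": "Anshan"}, {"name": "Anqing"}],
                 "B": [{"name": "Beijing"}]}}
print(jsonpath.jsonpath(data, "$..name"))
# ['Anshan', 'Anqing', 'Beijing']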



Reposted from blog.csdn.net/qq_42240071/article/details/80965294