抓包之 GET 请求

爬虫之淘宝抓包法

案例一(数据格式处理)

import urllib.request
#url = 'https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%9F%A9%E5%95%86%E8%A8%80%E5%A3%81%E7%BA%B8&hs=2&pn=0&spn=0&di=41910&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&ie=utf-8&oe=utf-8&cl=2&lm=-1&cs=1904897900%2C1084212529&os=2553811481%2C1788484320&simid=3433783175%2C509998897&adpicid=0&lpn=0&ln=30&fr=ala&fm=&sme=&cg=&bdtype=0&oriquery=%E9%9F%A9%E5%95%86%E8%A8%80%E5%A3%81%E7%BA%B8&objurl=http%3A%2F%2Fimgboys1.yohobuy.com%2Fcmsimg01%2F2018%2F12%2F03%2F09%2F31%2F01edc98fd3de8e80ba1cdb9becfe372254.jpeg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3By5i5k7y_z%26e3Bv54AzdH3F27wg2AzdH3FgjofAzdH3Fl80b8_mla8n_8_z%26e3Bip4s&gsm=0&islist=&querylist='
# Suggestion endpoint queried by this snippet; the q parameter carries an
# already percent-encoded keyword.
url = 'https://suggest.taobao.com/sug?area=b2c&code=utf-8&k=1&src=tmall_h5&q=%E8%A1%AC%E8%A1%AB%E5%A5%B3%E7%9F%AD%E8%A2%96%20%E6%A3%89%E9%BA%BB'
# Open the response in a `with` block so the connection is always closed, and
# set a timeout so an unresponsive server cannot block the script forever.
with urllib.request.urlopen(url, timeout=10) as response:
    html = response.read().decode('utf-8')
# Bare expression: displays the decoded body when run as the last line of a
# notebook cell; it has no effect in a plain script.
html

运行结果如下:
(原文此处为运行结果截图,图片未随文本保留。)

import requests
# Fetch the same `url` as the urllib snippet above, this time with requests.
# A timeout is passed explicitly because requests waits indefinitely by default.
strhtml = requests.get(url, timeout=10)
# Print the response body decoded to text.
print(strhtml.text)

运行结果如下:
(原文此处为运行结果截图,图片未随文本保留。)

import json
import time
import urllib.parse
import urllib.request

import requests

# Keyword whose search suggestions are requested.
key_word = "T恤"
# Percent-encode the non-ASCII keyword explicitly instead of relying on the
# HTTP client to repair the URL.
key_word_encode = urllib.parse.quote(key_word)
url = 'https://suggest.taobao.com/sug?area=b2c&code=utf-8&k=1&src=tmall_h5&q={}'.format(key_word_encode)
strhtml = requests.get(url, timeout=10)
# Fail with a clear HTTP error rather than a confusing JSON decode error when
# the server answers with an error page.
strhtml.raise_for_status()
str_json = json.loads(strhtml.text)
# Print the first element of every entry under the 'result' key.
for item in str_json['result']:
    print(item[0])

运行结果如下:
(原文此处为运行结果截图,图片未随文本保留。)

案例二(反反爬虫的几个方法)

import json
import time
import urllib.parse
import urllib.request

import pandas as pd
import requests

# Output CSV; it is recreated at the start of every run and then appended to.
csv_path = "C:/Users/18487/Desktop/xlc.csv"
# Suggestion endpoint; `{}` receives the percent-encoded keyword.
suggest_url = 'https://suggest.taobao.com/sug?area=b2c&code=utf-8&k=1&src=tmall_h5&q={}'

# Start the file with a single "title,level" row that serves as the header.
result = {'title': 'title', 'level': 'level'}
data = pd.DataFrame.from_dict(result, orient='index').T
data.to_csv(csv_path, index=False, header=False)

key_word = "连衣裙"
key_word_encode = urllib.parse.quote(key_word)  # percent-encode the keyword

# Technique 1: present the client as a mobile Safari browser via User-Agent.
headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'}
# Technique 3: send requests through a proxy so the local IP is not blocked.
# NOTE(review): requests chooses the proxy by URL scheme and only "http" is
# mapped here, while every URL below is https:// -- so this proxy is
# presumably not used. Add an "https" entry pointing at a working proxy to
# actually route the traffic through it.
proxies = {"http": "http://1.197.16.148:9999"}

url = suggest_url.format(key_word_encode)
strhtml = requests.get(url, headers=headers, proxies=proxies, timeout=10)
# Surface HTTP errors directly instead of failing later while parsing JSON.
strhtml.raise_for_status()
str_json = json.loads(strhtml.text)
for item in str_json['result']:  # first-level suggestions
    # First-level words only seed the second query; they are not written to
    # the CSV themselves.
    url2 = suggest_url.format(urllib.parse.quote(item[0]))
    # Reuse the same headers and proxy settings as the first request so both
    # levels apply the same anti-blocking measures.
    strhtml2 = requests.get(url2, headers=headers, proxies=proxies, timeout=10)
    strhtml2.raise_for_status()
    str_json2 = json.loads(strhtml2.text)
    for item2 in str_json2['result']:  # second-level suggestions
        result['title'] = item2[0]
        result['level'] = 2
        # Append one row per suggestion so partial progress survives a crash.
        data = pd.DataFrame.from_dict(result, orient='index').T
        data.to_csv(csv_path, index=False, header=False, mode='a+')
    # Technique 2 (disabled): pause here, e.g. for 0.5 seconds with the
    # imported time module, to throttle the request rate.
发布了15 篇原创文章 · 获赞 1 · 访问量 354

猜你喜欢

转载自blog.csdn.net/wanerding/article/details/104607235