Crawling Taobao "Phone Info"

Open Taobao and search for "phone"; the following results page is returned.

 

 Next, let's crawl the data.

 

 The first step is to fetch the page's HTML.

# Fetch the search results page and dump the raw HTML so it can be inspected.
html = requests.get(url, headers=headers)
print(html.text)

After printing the result, the information we need is not in the page source, so it must be loaded dynamically. Go through the requests in the browser's Network panel one by one until you find the JSON response that carries the data; it looks like this.

 

 

In the end, the content can be found in this JSON data. The rest is simple: we only need to parse the JSON and pick out the fields we need. This part is wrapped up in a function.

 
 
def get_data(url):
    """Fetch one Taobao search page, print every listing and download its image.

    The request is sent with the module-level ``headers``. The listings are
    read from a JSON object embedded in the response text, located by a fixed
    start marker and a fixed end marker.

    Args:
        url: URL of the search results page to fetch.

    Raises:
        json.JSONDecodeError: If the embedded JSON payload cannot be located
            or parsed.
        KeyError: If the payload lacks the expected keys.
    """
    start_marker = '{"pageName":"mainsrp"'
    end_marker = '"shopcardOff":true}}'

    # Without a timeout, requests waits indefinitely on a stalled connection.
    html = requests.get(url, headers=headers, timeout=10)
    html_text = html.text

    starts = html_text.find(start_marker)
    end_pos = html_text.find(end_marker)
    if starts == -1 or end_pos == -1:
        # str.find returns -1 on a miss; slicing with it would silently
        # produce garbage, so fail with a message that names the real cause.
        raise json.JSONDecodeError(
            'Taobao search payload markers not found in response',
            html_text,
            0,
        )
    end = end_pos + len(end_marker)

    json_data = json.loads(html_text[starts:end])
    get_json_data = json_data['mods']['itemlist']['data']['auctions']
    for data in get_json_data:
        title = data['title']
        item_loc = data['item_loc']
        view_sales = data['view_sales']
        nick = data['nick']
        view_price = data['view_price']
        # Presumably pic_url is protocol-relative ('//host/path'); joining it
        # against 'http:' supplies the scheme -- TODO confirm.
        pic_url = parse.urljoin('http:', data['pic_url'])
        print(title, '\n', item_loc, '\n', view_sales, ' shop:', nick, "price:", view_price)
        download(pic_url)
        print('-' * 80)
 

The results are as follows

 

 

 

 That completes crawling a single page. We want to crawl all of the pages, so next we click through to the second page and compare the URLs.

https://s.taobao.com/search?ie=utf8&initiative_id=staobaoz_20200402&stats_click=search_radio_all%3A1&js=1&imgfile=&q=%E6%89%8B%E6%9C%BA&suggest=history_2&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44

https://s.taobao.com/search?ie=utf8&initiative_id=staobaoz_20200402&stats_click=search_radio_all%3A1&js=1&imgfile=&q=%E6%89%8B%E6%9C%BA&suggest=history_2&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s=0

Comparing the two URLs, the `s` parameter of each page is the previous page's value plus 44, so we can crawl page after page by increasing `s` in steps of 44.

# Each results page is addressed by the `s` offset, stepped by 44 per page.
for each in range(0, 1000, 44):
    url = 'https://s.taobao.com/search?q=%E6%89%8B%E6%9C%BA&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&sort=sale-desc&bcoffset=0&p4ppushleft=%2C44&s={}'.format(each)

OK, the problem is now fully solved.

The complete code is as follows:

 1 import requests,json,lxml,os
 2 from lxml import etree
 3 from urllib import parse
 4 from uuid import uuid4
 5 headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 Edg/80.0.361.69'
 6      ,'cookie':自己的cookie
 7                 }
 8 def get_data(url):
 9     html = requests.get(url,headers=headers,)
10     html_text = html.text
11     starts = html_text.find('{"pageName":"mainsrp"')
12     end = html_text.find('"shopcardOff":true}}')+len('"shopcardOff":true}}')
13     json_data = json.loads(html_text[starts:end])
14     get_json_data = json_data['mods']['itemlist']['data']['auctions']
15     for data in get_json_data:
16         title = data['title']
17         item_loc = data['item_loc']
18         view_sales = data['view_sales']
19         nick = data['nick']
20         view_price = data['view_price']
21         pic_url = data['pic_url']
22         pic_url = parse.urljoin('http:',pic_url)
23         print(title,'\n',item_loc,'\n',view_sales,'店铺:',nick,"价格:",view_price)
24         download(pic_url)
25         print('-'*80)
26 
def download(url):
    """Download the resource at *url* and write it to a uniquely named .jpg file.

    Args:
        url: Address of the image to fetch.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=10)
    img = response.content
    # '文件路径' ("file path") is the author's placeholder prefix; replace it
    # with a real directory. uuid4() gives every image a distinct file name.
    with open('文件路径{}.jpg'.format(uuid4()), 'wb') as f:
        f.write(img)
32 
33 
if __name__ == '__main__':
    # Each results page is addressed by the `s` offset, stepped by 44 per page.
    for each in range(0, 1000, 44):
        url = 'https://s.taobao.com/search?q=%E6%89%8B%E6%9C%BA&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&sort=sale-desc&bcoffset=0&p4ppushleft=%2C44&s={}'.format(each)
        get_data(url)

 

Guess you like

Origin www.cnblogs.com/Truedragon/p/12621438.html