requests和re库爬取淘宝商品信息

这次爬取的仍然是静态网页，不过目标换成了结构比较复杂的淘宝。我在直接请求时发现，得到的结果和在浏览器中看到的网页 response 的 text 并不一致，起初以为是淘宝不能直接请求；后来在看一位大佬的博客时发现他的爬取是有结果的，于是我试着把他的代码复制下来，输出了他的 response 的 text 信息，发现和我得到的一样。所以之后写正则表达式时，我都是直接在开发者工具中观察数据格式的，包括商品名称、价格和付款人数。这种做法可能太麻烦，而且不好提取信息。今天安装 ChromeDriver 时出现了问题，等问题解决之后，我会改用 Selenium 爬取淘宝商品信息…
以下是全部代码：

import requests
import re

goods = '裙子'# Search keyword; change it to look for another product. It is appended to the search URL.
sum = 1# Global running counter of stored records (NOTE: shadows the builtin `sum`; print_mes increments it).

def get_html(url):
    """Download *url* and return the response body as text.

    Args:
        url: Full URL of the page to request.

    Returns:
        The response text when the server answers with HTTP 200.
        ``None`` (after printing ``'error!'``) when the status code is not
        200 or the request itself fails (connection error, timeout, ...).
    """
    headers = {
        # NOTE(review): this looks like the browser's ":path" pseudo-header
        # copied from DevTools for one specific query; it is sent here as an
        # ordinary header. Confirm whether the server needs it at all.
        'path': '/api?_ksTS=1533797297452_266&callback=jsonp267&ajax=true&m=customized&q=%E8%A3%99%E5%AD%90&s=36&bcoffset=-1&rn=eedf8f96dfb926ef01ced89a2a7b49d2',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    try:
        # Without a timeout a single stalled connection blocks the crawl
        # forever; requests applies no timeout by default.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Treat transport failures the same way as a non-200 answer.
        print('error!')
        return None
    if response.status_code == 200:
        return response.text
    print('error!')
    return None

def _decode_json_string(raw):
    """Decode the body of a JSON string literal; return *raw* if it is not valid JSON."""
    import json  # local import keeps this block self-contained

    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw


def get_message(list, html):
    """Extract price, sales and title of every item found in *html*.

    The page text is scanned for the ``"view_price"``, ``"raw_title"`` and
    ``"view_sales"`` fields. For each item one ``[price, sale, title]``
    entry (all strings) is appended to *list*.

    Args:
        list: Output list that receives one ``[price, sale, title]`` entry
            per item; it is modified in place.
        html: Page text to scan. ``None`` or an empty string adds nothing.

    Returns:
        None. Results are delivered through *list*.
    """
    if not html:
        # A failed download yields no text; there is nothing to parse.
        return

    # Capture groups pull out the value directly, so a ':' inside a title
    # can no longer cut the value short. The string patterns also accept
    # backslash-escaped characters (e.g. \" or \uXXXX) inside the value.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)
    sales = re.findall(r'"view_sales":"((?:[^"\\]|\\.)*)"', html)

    # zip() stops at the shortest list, so a record with a missing field
    # cannot raise IndexError. Values are decoded as JSON strings instead of
    # being passed to eval(), which must never run on downloaded text.
    for price, sale, title in zip(prices, sales, titles):
        list.append([
            price,
            _decode_json_string(sale),
            _decode_json_string(title),
        ])

def print_mes(list, file):
    """Print the scraped rows as a table and append them to *file*.

    Each row is printed with a per-call sequence number, then written to
    *file* as a tab-separated line prefixed with the global running counter
    ``sum``, which is incremented once per stored row.

    Args:
        list: Rows of ``[price, sale, title]`` strings.
        file: Path of the text file to append to (UTF-8).

    Returns:
        None.
    """
    global sum
    tplt = "{:4}\t{:8}\t{:10}\t{:16}"
    print(tplt.format("序号", "价格", "付款人数", "商品名称"))  # table header

    if not list:
        # Nothing to store; avoid creating an empty output file.
        return

    # Open the file once per call instead of once per record.
    with open(file, 'a', encoding='utf8') as f:
        for count, mes in enumerate(list, 1):
            print(tplt.format(count, mes[0], mes[1], mes[2]))
            print('第' + str(sum) + '数据正在存储...')
            f.write(str(sum)+'\t'+mes[0]+'\t'+mes[1]+'\t'+mes[2]+'\n')
            sum += 1

def main():
    """Crawl 100 result pages for the keyword in ``goods`` and store the items.

    For every page the search URL is built with an ``s`` offset of
    ``44 * page_index``, the page is downloaded, parsed and the extracted
    rows are printed and appended to ``淘宝.txt``.
    """
    file = '淘宝.txt'
    base_url = 'https://s.taobao.com/search?q=' + goods  # search URL for the keyword
    for i in range(0, 100):
        print('\t\t\t\t第'+str(i+1)+'页')
        # Build each page URL from the base URL. Appending to the previous
        # URL would pile up parameters (&s=0&s=44&s=88...).
        page_url = base_url + '&s=' + str(44 * i)
        html = get_html(page_url)
        if not html:
            # Download failed (get_html returned None); skip this page.
            continue
        List = []
        get_message(List, html)
        print_mes(List, file)

# Start the crawl; this call runs whenever the file is executed or imported.
main()

requests和re库爬取淘宝商品信息

猜你喜欢