Python: Enter a Keyword to Batch-Fetch E-commerce Product Information

I wrote a few small scripts for my family over the summer break; sharing them here, partly as a backup for myself.
Pinduoduo's anti-scraping seems too clever for me: I could never get anything out of it. Maybe I'm just not good enough yet.

Taobao


# -*- coding: utf-8 -*-
import requests
import re
import pandas as pd
import time
import xlwt
import os

# Paste your own cookies here (copied from the browser after logging in)
cookie = input('Please enter your Taobao cookies: ').strip()
# Fetch the page source
def getHTMLText(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
    user_cookies = cookie
    cookies = {}
    for a in user_cookies.split(';'):  # requests expects cookies as a dict, so split the raw string into name/value pairs
        name, value = a.strip().split('=', 1)
        cookies[name] = value
    try:
        r = requests.get(url, cookies=cookies, headers=headers, timeout=60)
        print(r.status_code)
        print(r.cookies)
        return r.text
    except requests.RequestException:
        print('Failed to fetch the page')
        return ''

# Parse the page and extract the data fields
def parsePage(html):
    goods_list = []
    try:
        views_title = re.findall('"raw_title":"(.*?)","pic_url"', html)
        print(len(views_title))  # Print the match count for each field; if the counts disagree, the rows cannot be assembled
        print(views_title)
        views_price = re.findall('"view_price":"(.*?)","view_fee"', html)
        print(len(views_price))
        print(views_price)
        item_loc = re.findall('"item_loc":"(.*?)","view_sales"', html)
        print(len(item_loc))
        print(item_loc)
        views_sales = re.findall('"view_sales":"(.*?)","comment_count"', html)
        print(len(views_sales))
        print(views_sales)
        comment_count = re.findall('"comment_count":"(.*?)","user_id"', html)
        print(len(comment_count))
        print(comment_count)
        shop_name = re.findall('"nick":"(.*?)","shopcard"', html)
        print(len(shop_name))
        for i in range(len(views_price)):
            goods_list.append([views_title[i], views_price[i], item_loc[i], comment_count[i], views_sales[i], shop_name[i]])
        # print(goods_list)
        print('Data scraped successfully')
        return goods_list
    except IndexError:
        print('Some records are incomplete, e.g. a product on this page is missing its location field')
        return goods_list

# Append the data to a CSV file, ready for later analysis
def save_to_file(goods_list):
    data = pd.DataFrame(goods_list)
    data.to_csv('C:\\Users\\Administrator\\Desktop\\product_data.csv', header=False, mode='a+')  # append rather than overwrite
# Convert the CSV to an Excel file
def txt_xls(filename, xlsname):
    try:
        f = open(filename, 'r', encoding='utf-8')
        xls = xlwt.Workbook()
        sheet = xls.add_sheet('sheet1', cell_overwrite_ok=True)
        x = 0
        while True:
            line = f.readline()
            if not line:
                break
            line = line.rstrip('\n')  # drop the trailing newline so it doesn't end up in the last cell
            # pandas writes the CSV comma-separated, so split each row on ','
            for i, item in enumerate(line.split(',')):
                sheet.write(x, i, item)
            x += 1
        f.close()
        xls.save(xlsname)
    except Exception:
        raise
def main():
    name = [['views_title', 'views_price', 'item_loc', 'comment_count', 'views_sales', 'shop_name']]
    data_name = pd.DataFrame(name)
    data_name.to_csv('C:\\Users\\Administrator\\Desktop\\product_data.csv', header=False, mode='a+')  # write the header row first
    goods = input('Please enter the product name to search for: ').strip()  # keyword to search
    print('Note: the product name must correspond to the cookies entered above')
    depth = 5  # number of pages to crawl
    start_url = 'http://s.taobao.com/search?q=' + goods  # initial search URL
    for i in range(depth):
        time.sleep(3 + i)
        try:
            page = i + 1
            print('Tong: crawling page %s' % page)
            url = start_url + '&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200408&ie=utf8&sort=sale-desc&bcoffset=0&p4ppushleft=%2C44&s=' + str(44 * i)
            html = getHTMLText(url)
            # print(html)
            goods_list = parsePage(html)
            save_to_file(goods_list)
        except Exception:
            print('Failed to save the data')

if __name__ == '__main__':
    main()
    filename = "C:\\Users\\Administrator\\Desktop\\product_data.csv"
    xlsname = "C:\\Users\\Administrator\\Desktop\\product_data.xls"
    txt_xls(filename, xlsname)
    os.remove(filename)
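
For reference, here is a minimal, self-contained sketch of the cookie handling used in getHTMLText above. The browser hands you one long 'name=value; name=value; ...' string, while requests expects a dict; the cookie values below are made up for illustration.

# A minimal sketch of the cookie-string-to-dict conversion in getHTMLText.
raw_cookie = 't=abc123; _tb_token_=e5x7y9z; cookie2=1f00d5790ab'  # hypothetical values

cookies = {}
for pair in raw_cookie.split(';'):
    name, value = pair.strip().split('=', 1)  # split on the first '=' only; values may contain '='
    cookies[name] = value

print(cookies)
# {'t': 'abc123', '_tb_token_': 'e5x7y9z', 'cookie2': '1f00d5790ab'}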
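
The regex extraction in parsePage works because Taobao embeds its search results in the page source as one JSON blob, so each field sits between two fixed keys. A toy demonstration on a hypothetical two-item fragment:

import re

# Hypothetical fragment imitating the JSON embedded in a Taobao results page
html = ('"raw_title":"Item A","pic_url":"//img.example/a.jpg","view_price":"19.90","view_fee":"0.00"'
        '"raw_title":"Item B","pic_url":"//img.example/b.jpg","view_price":"35.00","view_fee":"0.00"')

titles = re.findall('"raw_title":"(.*?)","pic_url"', html)
prices = re.findall('"view_price":"(.*?)","view_fee"', html)
print(titles)  # ['Item A', 'Item B']
print(prices)  # ['19.90', '35.00']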

JD.com

import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from lxml import etree
from openpyxl import Workbook
wb = Workbook()
sheet = wb.active
sheet['A1'] = 'name'
sheet['B1'] = 'price'
sheet['C1'] = 'commit'
sheet['D1'] = 'shop'
sheet['E1'] = 'sku'
sheet['F1'] = 'icons'
sheet['G1'] = 'detail_url'
driver_path = r"C:\Users\Administrator\Desktop\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])

# Do not load images
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
# Create the driver once, after all options have been set
driver = webdriver.Chrome(executable_path=driver_path, options=options)
wait = WebDriverWait(driver, 60)  # maximum wait time
def search(keyword):
    try:
        search_input = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key"))
        )  # wait for the search box to load
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button"))
        )  # wait until the search button is clickable
        search_input[0].send_keys(keyword)  # type the keyword into the search box
        submit.click()
        wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
            )
        )
        total_page = driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[2]/em[1]/b').text
        return int(total_page)
    except TimeoutException:
        return search(keyword)  # retry on timeout


def get_data(html):
    selec_data = etree.HTML(html)
    lis = selec_data.xpath('//ul[@class="gl-warp clearfix"]/li')
    for li in lis:
        try:
            title = li.xpath('.//div[@class="p-name p-name-type-2"]//em/text()')[0].strip()   # product name
            price = li.xpath('.//div[@class="p-price"]//i/text()')[0].strip()   # price
            comment = li.xpath('.//div[@class="p-commit"]//a/text()')  # number of reviews
            shop_name = li.xpath('.//div[@class="p-shop"]//a/text()')  # shop name
            data_sku = li.xpath('.//div[@class="p-focus  "]/a/@data-sku')[0] if li.xpath('.//div[@class="p-focus  "]/a/@data-sku') else None  # unique product id
            icons = li.xpath('.//div[@class="p-icons"]/i/text()')  # promotion tags
            comment = comment[0] if comment != [] else ''
            shop_name = shop_name[0] if shop_name != [] else ''
            icons_n = ','.join(icons)
            detail_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href')[0]  # detail page URL
            detail_url = 'https:' + detail_url
            item = [title, price, comment, shop_name, data_sku, icons_n, detail_url]
            print(item)
            sheet.append(item)
        except IndexError:
            continue  # skip products missing a required field
def main():
    url_main = 'https://www.jd.com/'
    keyword = input('Please enter a product name: ')  # search keyword
    driver.get(url=url_main)
    page = search(keyword)
    j = 1

    # JD's page parameter takes odd values only: result page j maps to page=2*j-1
    for i in range(1, page * 2, 2):

        if j == 1:
            url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(keyword, i, j)
        else:
            url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(keyword, i, (j - 1) * 50)
        driver.get(url)
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # scroll to the bottom so lazy-loaded items render
        time.sleep(3)
        driver.implicitly_wait(20)
        wait.until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@id="J_goodsList"]/ul/li[last()]'))
        )

        html = driver.page_source
        get_data(html)
        time.sleep(1)
        print(f'Crawling page {j}')
        j += 1
        if j == 20:
            break  # crawl at most 19 pages
    wb.save('JD_{}_info.xlsx'.format(keyword))
    print('Tong: > Crawl finished! <')
if __name__ == '__main__':
    main()
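
A quick standalone sketch of the URL scheme the loop in main() relies on: the page parameter only takes odd values (result page j maps to page=2*j-1), and s is the approximate result offset the loop computes. 'phone' is a placeholder keyword.

# Sketch of the search-URL scheme assumed above; 'phone' is a placeholder.
keyword = 'phone'
for j in range(1, 4):                  # first three result pages
    i = 2 * j - 1                      # JD's page parameter: odd numbers only
    s = j if j == 1 else (j - 1) * 50  # approximate result offset
    print('https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(keyword, i, s))
# https://search.jd.com/Search?keyword=phone&page=1&s=1&click=0
# https://search.jd.com/Search?keyword=phone&page=3&s=50&click=0
# https://search.jd.com/Search?keyword=phone&page=5&s=100&click=0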

Reposted from blog.csdn.net/TongOuO/article/details/126278706