项目进展:淘宝店铺抓取

1.概要:

项目旨在排查淘宝中可能存在的侵犯明星肖像权的行为,目标是获取店铺首页的图片。

淘宝店铺首页:https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306

为了缩减数据量,只处理大类别

example(女装):https://shopsearch.taobao.com/search?app=shopsearch&spm=a230r.7195193.0.0.S9RdIQ&q=%E5%A5%B3%E8%A3%85&tracelog=shopsearchnoqcat&sort=sale-desc

按照销量排行,取了前120个店铺

通过 selenium + PhantomJS 获取页面的 page_source。

通过re模块获取了图片的链接地址

2.代码

from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re
import urllib
from tkinter import *
import threading

#init driver with phantomJS
driver = webdriver.PhantomJS()

#new list
store_list = []

#init total_count
total_count = 0

#init mutex
mutex = threading.Lock()

def get_item_href():
    driver = webdriver.PhantomJS()
    driver.get("https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306")
    href_list = []
    for i in range(12):
        href_list.append(driver.find_element_by_xpath('//*[@id="shopsearchindex-hotcat"]/div/div/ul/li[%s]/a'%(i+1)).get_attribute('href'))
        href_list[i] += '&sort=sale-desc'
        print(href_list[i])
    return href_list

def get_shop_url(store_list,start_url):
    count = 0
    while count <= 5:
        url = start_url + '&s=%s'%(count*20)
        driver.get(url)
        page = driver.page_source
        urls = re.findall(r'//shop\d+.taobao.com',page,re.I)
        for url in urls:
            url = get_total_url(url)
            if url not in store_list:
                store_list.append(url)
        count += 1

def get_img_url(shop_url):
    global total_count
    mutex.acquire()
    driver.set_window_size(25600,14400)
    driver.get(shop_url)
    page_source = driver.page_source
    img_urls = re.findall(r'[https:]?//gdp.alicdn.com/.*?.jpg',page_source,re.I) + re.findall(r'[https:]?//img.alicdn.com/.*?.jpg',page_source,re.I)
    #for i in img2_urls:
    #    img_urls.append(i)
    download_path = r'C:\Users\Administrator\Pictures\test'
    for count in range(len(img_urls)):
        img_url = get_total_url(img_urls[count])
        try:
            store_name = "%s"%total_count+"_"+"%s"%count
            #urllib.request.urlretrieve(img_url,download_path+"%s.jpeg"%store_name)
            print("download %s.jpeg"%store_name)
            #t.insert('1.0',"download %s.jpeg"%store_name)
        except Exception as e:
            print(e)
            pass
    total_count += 1
    mutex.release()

def get_total_url(url):
    if url.startswith('//'):
        url = 'https:' + url
    elif url.startswith('/'):
        url = 'https:/' + url
    else:
        url = url
    return url

def print_url(store_list):
    for shop_url in store_list:
        print(shop_url,end = ',')

def main():
    href_list = get_item_href()
    for i in range(len(href_list)):
        start_url = href_list[i]
        get_shop_url(store_list,start_url)
        #print_url(store_list) #test
        for shop_url in store_list:
            print(shop_url)
            t = threading.Thread(target = get_img_url,args = (shop_url,))
            t.start()
            t.join()
            #get_img_url(shop_url)

main()

猜你喜欢

转载自blog.csdn.net/mr_guo_lei/article/details/78529515