1.概要:
本项目用于排查淘宝店铺中可能存在的侵犯明星肖像权的行为,目标是获取各店铺首页上的图片。
淘宝店铺首页:https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306
为了缩减数据量,只处理店铺搜索页上列出的12个热门大类别。
example(女装):https://shopsearch.taobao.com/search?app=shopsearch&spm=a230r.7195193.0.0.S9RdIQ&q=%E5%A5%B3%E8%A3%85&tracelog=shopsearchnoqcat&sort=sale-desc
每个类别按销量从高到低排序,取前120个店铺(共6页,每页20个)。
通过 selenium + PhantomJS 获取页面源码(page_source),
再通过 re 模块用正则表达式提取图片的链接地址。
2.代码
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re
import urllib
from tkinter import *
import threading

# Crawl Taobao shop home pages and list the .jpg images found on them.
#
# Flow: read the hot-category links from the shop-search page, collect shop
# URLs from the first result pages of every category (sorted by sales), then
# open each shop page and extract the alicdn image URLs from its source.
#
# NOTE(review): webdriver.PhantomJS and find_element_by_xpath only exist in
# Selenium releases before 4.x -- confirm the pinned Selenium version.

# Shop-search entry page; the hot-category links are read from it.
SHOP_SEARCH_URL = "https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306"

# Number of hot-category links read from the search page.
CATEGORY_COUNT = 12
# Result pages fetched per category and the offset step between pages
# (6 pages * 20 = the first 120 shops of each category).
PAGES_PER_CATEGORY = 6
SHOPS_PER_PAGE = 20

# Protocol-relative shop host, e.g. "//shop123456.taobao.com".
SHOP_URL_RE = re.compile(r'//shop\d+\.taobao\.com', re.I)

# Image URLs on the two alicdn hosts, with or without an explicit scheme.
# The path part stops at whitespace, quotes and angle brackets so that one
# match can never run from one HTML attribute into the next.
IMG_URL_RES = (
    re.compile(r'(?:https?:)?//gdp\.alicdn\.com/[^\s"\'<>]*?\.jpg', re.I),
    re.compile(r'(?:https?:)?//img\.alicdn\.com/[^\s"\'<>]*?\.jpg', re.I),
)

# Single PhantomJS instance shared by every function in this script.
driver = webdriver.PhantomJS()
# Shop home-page URLs collected so far (filled by get_shop_url).
store_list = []
# Number of shops whose page has been processed by get_img_url.
total_count = 0
# Serialises use of the shared driver and of total_count.
mutex = threading.Lock()


def get_item_href():
    """Return the hot-category result URLs, sorted by sales.

    Loads the shop-search page in the shared driver, reads the ``href`` of
    the first ``CATEGORY_COUNT`` category links, appends
    ``&sort=sale-desc`` to each one and prints it.

    Returns:
        list[str]: one URL per category.
    """
    driver.get(SHOP_SEARCH_URL)
    href_list = []
    # XPath positions are 1-based.
    for position in range(1, CATEGORY_COUNT + 1):
        link = driver.find_element_by_xpath(
            '//*[@id="shopsearchindex-hotcat"]/div/div/ul/li[%s]/a' % position)
        href = link.get_attribute('href') + '&sort=sale-desc'
        print(href)
        href_list.append(href)
    return href_list


def get_shop_url(store_list, start_url):
    """Append the shop URLs found under ``start_url`` to ``store_list``.

    Fetches ``PAGES_PER_CATEGORY`` result pages (the ``s`` query parameter
    is the result offset) and adds every shop URL that is not already in
    ``store_list``.

    Args:
        store_list: list of shop URLs; extended in place.
        start_url: category result URL, as returned by get_item_href().
    """
    # Set mirror of the list so the duplicate check is O(1).
    seen = set(store_list)
    for page in range(PAGES_PER_CATEGORY):
        driver.get(start_url + '&s=%s' % (page * SHOPS_PER_PAGE))
        for match in SHOP_URL_RE.findall(driver.page_source):
            shop_url = get_total_url(match)
            if shop_url not in seen:
                seen.add(shop_url)
                store_list.append(shop_url)


def _extract_img_urls(page_source):
    """Return the .jpg URLs in ``page_source``: gdp host first, then img."""
    img_urls = []
    for pattern in IMG_URL_RES:
        img_urls.extend(pattern.findall(page_source))
    return img_urls


def get_img_url(shop_url):
    """Load one shop page and report every image found on it.

    Prints one ``download <shop index>_<image index>.jpeg`` line per image
    and then increments the global ``total_count``.

    Args:
        shop_url: absolute URL of the shop home page.
    """
    global total_count
    # ``with`` releases the lock even when the driver raises; a bare
    # acquire()/release() pair would leave every later worker blocked.
    with mutex:
        # Very large viewport -- presumably so that lazily loaded images are
        # already present in page_source; TODO confirm.
        driver.set_window_size(25600, 14400)
        driver.get(shop_url)
        img_urls = _extract_img_urls(driver.page_source)
        for count, _raw_url in enumerate(img_urls):
            store_name = "%s_%s" % (total_count, count)
            # NOTE: the actual download is disabled; only the target name is
            # reported. To enable it, fetch get_total_url(_raw_url) with
            # urllib.request.urlretrieve into "<store_name>.jpeg".
            print("download %s.jpeg" % store_name)
        total_count += 1


def get_total_url(url):
    """Return ``url`` with an ``https`` scheme prepended when it has none.

    ``//host/path`` becomes ``https://host/path``; a value starting with a
    single ``/`` is prefixed with ``https:/``; anything else is returned
    unchanged.
    """
    if url.startswith('//'):
        return 'https:' + url
    if url.startswith('/'):
        return 'https:/' + url
    return url


def print_url(store_list):
    """Print every URL in ``store_list`` on one line, each followed by ','."""
    for shop_url in store_list:
        print(shop_url, end=',')


def main():
    """Collect the shop URLs of every category, then scan each shop page."""
    try:
        for start_url in get_item_href():
            get_shop_url(store_list, start_url)
        for shop_url in store_list:
            print(shop_url)
            # Each worker is joined immediately, so shops are handled one at
            # a time. An error in a worker is reported by the threading
            # module and does not stop the remaining shops.
            worker = threading.Thread(target=get_img_url, args=(shop_url,))
            worker.start()
            worker.join()
    finally:
        # Always shut the PhantomJS process down, also after a failure.
        driver.quit()


if __name__ == "__main__":
    main()