给一个汽配图片网站,需要抓取某品类下的数据和图片。
步骤:
第一步: 收集品类页面下所有 item 的 title 和 url
第二步: 根据每个 item 的 url,收集该 item 的 title、description、fitment 以及图片 url
第三步: 根据图片 url 下载图片并保存
代码实现:
第一步:品类网址下的所有item收集 title url
"""Step 1: collect (title, item_url) pairs for every item under a category page.

Scrapes the category listing pages of an auto-parts site (or locally saved
HTML dumps), extracts each product's title and detail-page URL, and writes
the de-duplicated result to an Excel file.  Item detail pages can then be
scraped per-item (images, description, fitment).
"""
import re
import json
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import random

# Pool of desktop-browser User-Agent strings; one is picked at random per
# request so traffic looks less uniform.  Exact duplicates from the original
# list were removed -- they only skewed random.choice().
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
]


class IPProxy():
    """Thin client for a local proxy-pool service plus request helpers."""

    def __init__(self, count=50):
        # Number of candidate proxies to request from the pool each time.
        self.count = count

    def get_IPProxies(self):
        """Return a requests-style ``proxies`` dict picked at random from the pool.

        NOTE(review): assumes a proxy-pool HTTP service is listening on
        127.0.0.1:8000 and answers with a JSON list of [ip, port, ...] rows --
        confirm the service is running before scraping.
        """
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        proxy = random.choice(json.loads(r.text))
        addr = "http://%s:%s" % (proxy[0], proxy[1])
        return {"http": addr, "https": addr}

    def get_headers(self):
        """Return browser-like request headers with a random User-Agent."""
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }

    def get_html_content(self, url, attempts=4):
        """GET ``url`` through a random proxy, retrying on failure.

        A non-2xx/3xx response or a body shorter than 500 bytes is treated as
        blocked/bogus.  Returns the page text, or None after ``attempts``
        failed tries (matches the original 1 try + 3 retries).
        """
        for _ in range(attempts):
            try:
                r = requests.get(url=url, headers=self.get_headers(),
                                 timeout=5, proxies=self.get_IPProxies())
                if r.ok and len(r.content) >= 500:
                    return r.text
            except Exception:
                # Dead proxy, timeout, or pool error -- retry with a new proxy.
                pass
        return None


class image_structs():
    """One product gallery image: element id + image URL."""

    def __init__(self):
        self.picture_url = {
            "image_id": '',
            "picture_url": '',
        }


class data_structs():
    """All fields collected for a single product item."""

    def __init__(self):
        self.info = {
            "title": '',
            "item_url": '',
            "id": 0,
            "picture_url": [],   # list of image_structs.picture_url dicts
            "std_desc": '',
            "description": '',
            "information": '',
            "fitment": '',
        }


def _extract_links(soup):
    """Return [title, item_url] pairs from every <a class="product-image"> tag."""
    return [[a["title"], a["href"]]
            for a in soup.find_all("a", class_="product-image")]


def soup_parser(outfile="item_urls.xlsx"):
    """Parse locally saved listing pages 1.txt .. 9.txt into ``outfile``."""
    result = []
    for page in range(1, 10):
        with open(str(page) + ".txt", "r", encoding="utf-8") as fp:
            soup = BeautifulSoup(fp.read(), "html.parser")
        result.extend(_extract_links(soup))
    df = pd.DataFrame(result, columns=["title", "item_url"])
    print(len(df))
    df = df.drop_duplicates()
    print(len(df))
    df["id"] = df.index
    df.to_excel(outfile, index=False)


def content_parser(soup):
    """Parse one listing page's HTML text into [title, item_url] pairs."""
    print(soup)
    return _extract_links(BeautifulSoup(soup, "html.parser"))


def get_item_list(outfile):
    """Crawl every pagination page of the category and save title/url pairs."""
    pages = ["&p=%d" % n for n in range(1, 7)]
    # e.g. http://4x4sidesteps.co.uk/side-steps.html?limit=15&p=2
    urls = ['http://4x4sidesteps.co.uk/side-steps.html?limit=15%s' % n
            for n in [''] + pages]
    print(urls)
    result = []
    ips = IPProxy()
    for url in urls:
        html = ips.get_html_content(url)
        if html is None:
            continue  # every retry failed for this page; skip instead of crashing
        res = content_parser(html)
        print(res)
        result.extend(res)
    df = pd.DataFrame(result, columns=["title", "item_url"]).drop_duplicates()
    df["id"] = df.index
    df.to_excel(outfile, index=False)


def get_item_info_4x4(file, outfile=""):
    """Fetch each item's page and save its gallery-image URLs, one xlsx per id.

    Was: wrote a debug "test.xlsx" and called exit() after the first row,
    aborting the whole run.  Now writes ``<id>.xlsx`` per item (so reruns can
    resume) and, when ``outfile`` is given, re-saves the source frame at the end.
    """
    DEFAULT_FALSE = ""
    df = pd.read_excel(file)
    ips = IPProxy()
    for i in df.index:
        row_id = df.loc[i, "id"]
        if os.path.exists(str(int(row_id)) + ".xlsx"):
            continue  # already scraped in a previous run
        data = data_structs()
        data.info["title"] = df.loc[i, "title"]
        data.info["id"] = row_id
        data.info["item_url"] = df.loc[i, "item_url"]
        html = ips.get_html_content(data.info["item_url"])
        print(html)
        try:
            soup = BeautifulSoup(html, "html.parser")
            for a in soup.find_all("img", class_=re.compile("^gallery-image")):
                image = image_structs()
                image.picture_url["image_id"] = a["id"]
                image.picture_url["picture_url"] = a["src"]
                print(image.picture_url)
                data.info["picture_url"].append(image.picture_url)
        except Exception:
            # Download/parse failure: record an empty value for this item.
            data.info["picture_url"] = DEFAULT_FALSE
        print(data.info)
        print(data.info.keys())
        if data.info["picture_url"]:
            # Flatten: one output row per image, carrying title/id along.
            singledf = pd.json_normalize(data.info, "picture_url", ['title', 'id'])
        else:
            # No images found -- still save a stub row so the id is marked done.
            singledf = pd.DataFrame([{"title": data.info["title"], "id": row_id}])
        singledf.to_excel(str(int(row_id)) + ".xlsx", index=False)
    if outfile:
        df.to_excel(outfile, index=False)


def get_item_info(file, outfile):
    """Fetch each item page: images, std_desc, description, information, fitment.

    Writes one ``<id>.xlsx`` per item (the original checked for that file to
    resume but then wrote a debug "test.xlsx" and exit()-ed after the first
    row), then saves the source frame to ``outfile``.
    """
    DEFAULT_FALSE = ""
    df = pd.read_excel(file)
    for i in df.index:
        row_id = df.loc[i, "id"]
        if os.path.exists(str(int(row_id)) + ".xlsx"):
            continue  # resume support: skip items already saved
        item_url = df.loc[i, "item_url"]
        web = requests.get(item_url)
        soup = BeautifulSoup(web.text, "html.parser")
        data = data_structs()
        data.info["title"] = df.loc[i, "title"]
        data.info["id"] = row_id
        data.info["item_url"] = item_url
        # Gallery images.
        for a in soup.find_all("img", class_=re.compile("^gallery-image")):
            image = image_structs()
            image.picture_url["image_id"] = a["id"]
            image.picture_url["picture_url"] = a["src"]
            print(image.picture_url)
            data.info["picture_url"].append(image.picture_url)
        print(data.info)
        # Structured description block (itemprop="description").
        std_desc = soup.find("div", itemprop="description")
        if std_desc is not None:
            data.info["std_desc"] = "\n".join(std_desc.stripped_strings)
        else:
            data.info["std_desc"] = DEFAULT_FALSE
        # Each remaining section is the element right after its <h2> heading;
        # an absent heading yields the empty string, as before.
        for heading in ("Description", "Information", "Fitment"):
            head = soup.find('h2', string=heading)
            value = head.find_next() if head is not None else DEFAULT_FALSE
            data.info[heading.lower()] = str(value)
        print(data.info.keys())
        singledf = pd.json_normalize(
            data.info, "picture_url",
            ['title', 'item_url', 'id', 'std_desc', 'description',
             'information', 'fitment'])
        singledf.to_excel(str(int(row_id)) + ".xlsx", index=False)
    df.to_excel(outfile, index=False)


if __name__ == '__main__':
    # get_item_list("item_urls.xlsx")
    get_item_info_4x4("item_urls.xlsx")
    # soup_parser()
    # get_item_info("item_urls.xlsx", "item_urls_info.xlsx")
第二步: 根据item的url网址下 收集该item相关的title description fitment 图片url
"""Step 2: for every item URL, collect title / description / fitment / image URLs.

Multi-threaded: a queue of (title, item_url, id) rows is consumed by daemon
worker threads; each worker writes one ``<id>.xlsx`` into the output folder,
and ``main`` keeps re-queueing the still-missing ids until every item has
been scraped, then merges everything into item_infos.xlsx.
"""
import random
from http.cookiejar import CookieJar  # kept from original; not used here
import requests
from bs4 import BeautifulSoup
import numpy as np  # kept from original; not used here
import re
from queue import Queue
import time
import os
import threading
import json
import logging
import pandas as pd

# Pool of desktop-browser User-Agent strings; one is picked at random per
# request.  Exact duplicates from the original list were removed.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
]


class IPProxy():
    """Thin client for a local proxy-pool service plus request headers."""

    def __init__(self, count=50):
        # Number of candidate proxies to request from the pool each time.
        self.count = count

    def get_IPProxies(self):
        """Return a requests-style ``proxies`` dict picked at random.

        NOTE(review): assumes a proxy-pool HTTP service on 127.0.0.1:8000
        returning a JSON list of [ip, port, ...] rows.
        """
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        proxy = random.choice(json.loads(r.text))
        addr = "http://%s:%s" % (proxy[0], proxy[1])
        return {"http": addr, "https": addr}

    def get_headers(self):
        """Return browser-like request headers with a random User-Agent."""
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }


class image_structs():
    """One product gallery image: element id + image URL."""

    def __init__(self):
        self.picture_url = {
            "image_id": '',
            "picture_url": '',
        }


class data_structs():
    """All fields collected for a single product item."""

    def __init__(self):
        self.info = {
            "title": '',
            "item_url": '',
            "id": 0,
            "picture_url": [],   # list of image_structs.picture_url dicts
            "std_desc": '',
            "description": '',
            "information": '',
            "fitment": '',
        }


class EbaySpider(object):
    """Scrapes one item page and saves its image URLs to <file>/<id>.xlsx."""

    def __init__(self, file=""):
        self.file = file  # output directory

    def get_html_content(self, url, attempts=4):
        """GET ``url`` via a random proxy, retrying; None when all tries fail.

        A non-2xx/3xx response or a body under 500 bytes counts as a failure.
        """
        ips = IPProxy()
        for _ in range(attempts):
            try:
                r = requests.get(url=url, headers=ips.get_headers(),
                                 timeout=5, proxies=ips.get_IPProxies())
                if r.ok and len(r.content) >= 500:
                    return r.text
            except Exception:
                pass  # dead proxy / timeout -> retry with a fresh proxy
        return None

    def get_item_info(self, item):
        """Scrape one (title, item_url, id) row and save its images.

        Output is one row per gallery image (title/id repeated), written to
        ``<self.file>/<id>.xlsx``.
        """
        DEFAULT_FALSE = ""
        row_id = item[2]
        item_url = item[1]
        data = data_structs()
        data.info["title"] = item[0]
        data.info["id"] = row_id
        data.info["item_url"] = item_url
        html = self.get_html_content(item_url)
        try:
            soup = BeautifulSoup(html, "html.parser")
            for a in soup.find_all("img", class_=re.compile("^gallery-image")):
                image = image_structs()
                image.picture_url["image_id"] = a["id"]
                image.picture_url["picture_url"] = a["src"]
                print(image.picture_url)
                data.info["picture_url"].append(image.picture_url)
        except Exception:
            # Download/parse failure: record empty so the failure is visible.
            data.info["picture_url"] = DEFAULT_FALSE
        singledf = pd.json_normalize(data.info, "picture_url", ['title', 'id'])
        singledf.to_excel(self.file + "/" + str(int(row_id)) + ".xlsx", index=False)


class ThreadCrawl(threading.Thread):
    """Worker thread: pulls item rows off the shared queue forever."""

    def __init__(self, queue, file):
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.file = file
        self.spider = EbaySpider(self.file)

    def run(self):
        while True:
            item = self.queue.get()
            try:
                self.spider.get_item_info(item)
            except Exception as exc:
                # A failed item must not kill the worker (and it must not
                # skip task_done(), or Queue.join() would block forever);
                # main() re-queues missing ids on the next pass.
                logging.info("item failed, will retry later: %s", exc)
            finally:
                logging.info("now queue size is: %d" % self.queue.qsize())
                self.queue.task_done()


class EbaySpiderJob():
    """Fans ``qs`` out to ``size`` daemon worker threads and waits for them."""

    def __init__(self, size, qs, file):
        self.size = size   # number of worker threads
        self.qs = qs       # iterable of (title, item_url, id) rows
        self.file = file   # output directory

    def work(self):
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue, self.file)
            t.daemon = True  # Thread.setDaemon() is deprecated
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # block until every queued item is task_done()


def combine_data(combine_file, outfile):
    """Concatenate every per-item xlsx under ``combine_file`` into ``outfile``.

    Returns False (and writes nothing) when the folder is empty, else True.
    """
    frames = []
    for root, path, files in os.walk(combine_file):
        if not files:
            return False
        for file in files:
            frames.append(pd.read_excel('//'.join([root, file])))
    pd.concat(frames).to_excel(outfile, index=False)
    return True


def main():
    """Keep scraping until every id in item_urls.xlsx has its own file."""
    file = "info"
    outfile = "item_infos.xlsx"
    if not os.path.exists(file):
        os.makedirs(file)
    df = pd.read_excel("item_urls.xlsx")
    existitem = []
    if combine_data(file, outfile):
        existitem = pd.read_excel(outfile)["id"].unique()
    # NOTE(review): loops forever if some item can never be scraped.
    while len(existitem) != len(df):
        temp = df[~df["id"].isin(existitem)]  # rows not yet scraped
        amazonJob = EbaySpiderJob(8, temp.values, file)
        amazonJob.work()
        combine_data(file, outfile)
        existitem = pd.read_excel(outfile)["id"].unique()


def single_test():
    """Ad-hoc smoke test.  TODO: get_item_info needs a (title, url, id) row."""
    spider = EbaySpider()
    spider.get_item_info()


if __name__ == '__main__':
    # single_test()
    main()
第三步: 根据图片url下载图片并保存
"""Step 3: download every collected picture URL and save it as a .jpg.

Multi-threaded: a queue of (picture_id, picture_url) rows is consumed by
daemon worker threads; each image is saved as ``picture/<picture_id>.jpg``
where ``picture_id = "<item id>_<gallery image id>"``.  ``main`` re-queues
the still-missing pictures until the folder covers every row.
"""
import random
import urllib  # kept from original; not used here
from http.cookiejar import CookieJar  # kept from original; not used here
import requests
from bs4 import BeautifulSoup  # kept from original; not used here
import numpy as np  # kept from original; not used here
import re
from queue import Queue
import time
import os
import threading
import json
import logging
import pandas as pd

# Pool of desktop-browser User-Agent strings; one is picked at random per
# request.  Exact duplicates from the original list were removed.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
]


class IPProxy():
    """Thin client for a local proxy-pool service plus request headers."""

    def __init__(self, count=50):
        # Number of candidate proxies to request from the pool each time.
        self.count = count

    def get_IPProxies(self):
        """Return a requests-style ``proxies`` dict picked at random.

        NOTE(review): assumes a proxy-pool HTTP service on 127.0.0.1:8000
        returning a JSON list of [ip, port, ...] rows.
        """
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        proxy = random.choice(json.loads(r.text))
        addr = "http://%s:%s" % (proxy[0], proxy[1])
        return {"http": addr, "https": addr}

    def get_headers(self):
        """Return browser-like request headers with a random User-Agent."""
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }


class image_structs():
    """One product gallery image: element id + image URL."""

    def __init__(self):
        self.picture_url = {
            "image_id": '',
            "picture_url": '',
        }


class data_structs():
    """All fields collected for a single product item."""

    def __init__(self):
        self.info = {
            "title": '',
            "item_url": '',
            "id": 0,
            "picture_url": [],   # list of image_structs.picture_url dicts
            "std_desc": '',
            "description": '',
            "information": '',
            "fitment": '',
        }


class EbaySpider(object):
    """Downloads one image per queue item into the output directory."""

    def __init__(self, file=""):
        self.file = file  # output directory

    def _fetch(self, url, attempts=4):
        """Shared retry/proxy logic; returns a Response or None.

        NOTE(review): the 500-byte minimum also rejects genuinely tiny
        images -- confirm no wanted picture is smaller than that.
        """
        ips = IPProxy()
        for _ in range(attempts):
            try:
                r = requests.get(url=url, headers=ips.get_headers(),
                                 timeout=5, proxies=ips.get_IPProxies())
                if r.ok and len(r.content) >= 500:
                    return r
            except Exception:
                pass  # dead proxy / timeout -> retry with a fresh proxy
        return None

    def get_html_content(self, url):
        """Return the page text for ``url``, or None when all tries fail."""
        r = self._fetch(url)
        return r.text if r is not None else None

    def get_html_response(self, url):
        """Return the raw Response for ``url`` (for binary bodies), or None."""
        return self._fetch(url)

    def get_item_info(self, item):
        """Download one (picture_id, picture_url) row to <file>//<id>.jpg.

        When every fetch attempt fails the file is simply not written, so
        main() will re-queue it on the next pass.
        """
        picture_id = item[0]
        url = item[1]
        save_file = self.file + "//" + str(picture_id) + ".jpg"
        response = self.get_html_response(url)
        if response:
            with open(save_file, 'wb') as f:
                f.write(response.content)


class ThreadCrawl(threading.Thread):
    """Worker thread: pulls picture rows off the shared queue forever."""

    def __init__(self, queue, file):
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.file = file
        self.spider = EbaySpider(self.file)

    def run(self):
        while True:
            item = self.queue.get()
            try:
                self.spider.get_item_info(item)
            except Exception as exc:
                # A failed download must not kill the worker (and it must not
                # skip task_done(), or Queue.join() would block forever);
                # main() re-queues missing pictures on the next pass.
                logging.info("picture failed, will retry later: %s", exc)
            finally:
                logging.info("now queue size is: %d" % self.queue.qsize())
                self.queue.task_done()


class EbaySpiderJob():
    """Fans ``qs`` out to ``size`` daemon worker threads and waits for them."""

    def __init__(self, size, qs, file):
        self.size = size   # number of worker threads
        self.qs = qs       # iterable of (picture_id, picture_url) rows
        self.file = file   # output directory

    def work(self):
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue, self.file)
            t.daemon = True  # Thread.setDaemon() is deprecated
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # block until every queued item is task_done()


def combine_file_name(combine_file):
    """Return the extension-less names of the files already downloaded."""
    result = []
    for root, path, files in os.walk(combine_file):
        if not files:
            return result  # folder empty: nothing downloaded yet
        for file in files:
            filename, ext = os.path.splitext(file)
            result.append(filename)
    return result


def main():
    """Download every picture listed in item_infos.xlsx into ./picture."""
    file = "picture"
    if not os.path.exists(file):
        os.makedirs(file)
    df = pd.read_excel("item_infos.xlsx")
    print(df.info())
    # picture_id = "<item id>_<gallery image id>" keeps filenames unique.
    for i in df.index:
        df.loc[i, "picture_id"] = str(df.loc[i, "id"]) + "_" + str(df.loc[i, "image_id"])
    df = df[["picture_id", "picture_url"]]
    existitem = combine_file_name(file)
    # NOTE(review): loops forever if some image can never be fetched.
    while len(existitem) != len(df):
        temp = df[~df["picture_id"].isin(existitem)]  # not yet downloaded
        amazonJob = EbaySpiderJob(8, temp.values, file)
        amazonJob.work()
        existitem = combine_file_name(file)


def single_test():
    """Ad-hoc smoke test.  TODO: get_item_info needs a (picture_id, url) row."""
    spider = EbaySpider()
    spider.get_item_info()


if __name__ == '__main__':
    # single_test()
    main()