"""Amazon ASIN availability checker.

Multi-threaded scraper: reads ASINs from an Excel sheet, fetches each
product page at https://www.amazon.com/dp/<asin> through a rotating
local proxy pool, extracts the page title and stores (asin, title)
rows in a local MySQL table until every ASIN has been recorded.
"""
from queue import Queue
import time
import random
import threading
import numpy as np
import logging
import pymysql
import pandas as pd
import requests
import json
import re
from bs4 import BeautifulSoup


class IPProxy():
    """Client for a local proxy-pool service plus request-header randomizer."""

    # Browser User-Agent pool; one entry is picked at random per request so
    # successive fetches look like different clients.
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    ]

    def __init__(self, count=50):
        # Number of candidate proxies to request from the pool per call.
        self.count = count

    def get_IPProxies(self):
        """Ask the local proxy-pool service for candidates and pick one.

        Returns:
            dict in the format expected by requests' ``proxies=`` argument.

        Raises:
            Any network/JSON error from the pool service -- callers
            (get_html_content) treat a raise as "try again".
        """
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        ip_ports = json.loads(r.text)
        proxy = random.choice(ip_ports)
        ip = proxy[0]
        port = proxy[1]
        return {
            "http": "http://%s:%s" % (ip, port),
            "https": "https://%s:%s" % (ip, port),
        }

    def get_headers(self):
        """Return browser-like request headers with a random User-Agent."""
        return {
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }

    def get_html_content(self, url, max_retries=3):
        """Fetch *url* through a random proxy with retries.

        Makes one initial attempt plus up to *max_retries* retries (the
        same 4-attempt budget as before; the retry loop was previously
        copy-pasted). A response is accepted only when it is OK and the
        body is at least 500 bytes -- smaller bodies are treated as
        proxy/error pages, as in the original check.

        Args:
            url: page to fetch.
            max_retries: extra attempts after the first failure.

        Returns:
            Page text on success, or None when every attempt fails.
        """
        for _ in range(1 + max_retries):
            try:
                proxies = self.get_IPProxies()
                headers = self.get_headers()
                r = requests.get(url=url, headers=headers, timeout=5, proxies=proxies)
                if r.ok and len(r.content) >= 500:
                    return r.text
            except Exception:
                # Proxy-pool or network hiccup: just move on to the next proxy.
                pass
        return None


class Database():
    """Thin pymysql wrapper around one table of (asin, checked) rows."""

    def __init__(self):
        # Local MySQL configuration; adjust for your environment.
        self.table_name = "aces"
        self.host = "localhost"
        self.user = "root"
        self.password = "123456"
        self.database = "test"
        self.port = 3306
        self.charset = "utf8"
        # Fix: self.port was previously defined but never passed to connect().
        self.connet = pymysql.connect(host=self.host, user=self.user,
                                      password=self.password,
                                      database=self.database,
                                      port=self.port,
                                      charset=self.charset)
        self.cursor = self.connet.cursor()

    def dropTables(self):
        """Drop the table if it exists (used once on a fresh run)."""
        sql = 'drop table if exists ' + self.table_name
        self.cursor.execute(sql)
        print("删表")

    def createTables(self):
        """Create the table if it does not exist."""
        sql = 'create table if not exists ' + self.table_name + '''
            (
            asin varchar(11) primary key not null,
            checked varchar(200)
            )'''
        self.cursor.execute(sql)
        print("建表")

    def save(self, aceslist):
        """Insert one [asin, checked] pair and commit (parameterized SQL)."""
        sql = 'insert into ' + self.table_name + ' ( asin, checked) values(%s,%s)'
        self.cursor.execute(sql, (aceslist[0], aceslist[1]))
        self.connet.commit()

    def select_all(self):
        """Return all stored ASINs (empty list when the table is empty)."""
        sql = "select asin from " + self.table_name
        self.cursor.execute(sql)
        result = self.cursor.fetchall()
        if len(result) == 0:
            return []
        df = pd.DataFrame(np.array(result), columns=["asin"])
        return df["asin"].values

    def is_exists_asin(self, asin):
        """Return True when *asin* is already stored, else False."""
        sql = 'select * from ' + self.table_name + ' where asin = %s'
        self.cursor.execute(sql, asin)
        return self.cursor.fetchone() is not None


class AmazonSpider():
    """Fetches one Amazon product page per ASIN and stores its title."""

    def __init__(self):
        self.db = Database()

    def getDataById(self, queryId):
        """Scrape https://www.amazon.com/dp/<queryId> and save its title.

        Skips IDs already present in the database. Gives up quietly when
        the page cannot be fetched or the title element is missing, so the
        worker thread does not endlessly re-queue permanently-broken IDs.
        """
        if self.db.is_exists_asin(queryId):
            return
        url = "https://www.amazon.com/dp/" + str(queryId)
        html = IPProxy().get_html_content(url)
        if html is None:
            # All proxy attempts failed; leave this ASIN for a later pass.
            return
        try:
            soup = BeautifulSoup(html, 'html.parser')
            content = soup.find_all("span", id="asTitle")
            state = content[0].string
            print(queryId, state)
            self.db.save([queryId, state])
        except Exception:
            # Previously a silent bare except; keep the swallow (so the
            # queue accounting in ThreadCrawl.run is unchanged) but log it.
            logging.exception("failed to parse/save %s", queryId)


class ThreadCrawl(threading.Thread):
    """Worker thread: pulls ASINs off a shared queue and scrapes them."""

    def __init__(self, queue):
        # Timestamped log format; basicConfig is a no-op after the first call.
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        super().__init__()
        self.queue = queue
        self.spider = AmazonSpider()

    def run(self):
        """Consume the queue forever; re-queue items whose scrape raised."""
        while True:
            item = self.queue.get()
            try:
                self.spider.getDataById(item)
            except Exception:
                # Hard failure: put the ASIN back for another attempt.
                self.queue.put(item)
                logging.info("now queue size is: %d", self.queue.qsize())
            # Always signal completion of this get(), matching the original
            # accounting so queue.join() in AmazonSpiderJob.work returns.
            self.queue.task_done()


class AmazonSpiderJob():
    """Runs *size* crawler threads over the given iterable of ASINs."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads
        self.qs = qs      # iterable of ASINs still to be scraped

    def work(self):
        """Fill the queue, start daemon workers, block until it drains."""
        toSpiderQueue = Queue()
        for q in self.qs:
            toSpiderQueue.put(q)
        for i in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            t.daemon = True  # setDaemon() is deprecated since Python 3.10
            t.start()
        toSpiderQueue.join()


def db_init():
    """Drop and recreate the table. Run once on the very first launch only."""
    db = Database()
    db.dropTables()
    db.createTables()


def db_select():
    """Return the ASINs already stored in the database."""
    db = Database()
    return db.select_all()


def main(excel_path="asin_20180108.xlsx"):
    """Keep scraping until every ASIN in the spreadsheet is in the DB.

    Args:
        excel_path: Excel file with an "asin" column (previously hard-coded).
    """
    existitem = db_select()
    df = pd.read_excel(excel_path)
    print(df.info())
    while len(existitem) != len(df):
        # Only the ASINs not yet stored in the database.
        temp = df[~df["asin"].isin(existitem)]
        amazonJob = AmazonSpiderJob(8, temp["asin"].values)
        amazonJob.work()
        existitem = db_select()


def single_test():
    """Scrape a single known ASIN (smoke test)."""
    spider = AmazonSpider()
    spider.getDataById("B00IJ5TTCS")


if __name__ == '__main__':
    # db_init()      # uncomment on the first run to (re)create the table
    # single_test()
    main()
# amazon asin check (ASIN availability checker)
# Reposted from: blog.csdn.net/zn505119020/article/details/79002290