利用Python批量抓取京东评论数据

京东图书评论包含非常丰富的信息,例如购买日期、书名、作者、好评、中评、差评等。下面以抓取购买日期为例,使用 Python + MySQL 搭配实现。程序不大,只有约100行,相关的解释都已写在程序的注释里:

import re
import sys
import threading
import time

import MySQLdb
import win32com.client
from bs4 import BeautifulSoup
from selenium import webdriver

 8 def mydebug():
 9     driver.quit()
10     exit(0)
# (Promotional note from the original post: Python QQ group 125240963, study materials updated daily.)
def catchDate(s):
    """Extract the purchase-date fragments from one review page.

    Looks at every ``<div class="date-buy">`` element in the page and, for
    each one that contains a ``<br>`` tag with non-empty contents, collects
    that tag's ``.contents`` list.

    Side effect: the module-level counter ``nowtimes`` is incremented once
    for every entry that is collected.

    Args:
        s: HTML source of the review page.

    Returns:
        A list of non-empty ``.contents`` lists, one per matching element,
        so callers can safely read ``item[0]``.
    """
    global nowtimes

    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # whether <br> ends up with any children may depend on that parser.
    soup = BeautifulSoup(s)
    collected = []

    for block in soup.find_all("div", class_="date-buy"):
        br_tag = block.find("br")
        if br_tag is None:
            # This block has no <br>, hence nothing to extract.
            continue
        parts = br_tag.contents
        if not parts:
            # .contents is a list, so the old `!= ""` test never filtered
            # anything; empty lists would later fail on item[0].
            continue
        collected.append(parts)
        nowtimes += 1

    return collected


def getTimes(n, t):
    """Return a progress message of the form "当前进度为:NN%".

    Args:
        n: Amount of work done so far.
        t: Total amount of work expected.

    Returns:
        The message string with the percentage truncated to an integer.
        When ``t`` is zero the percentage is reported as 0 instead of
        raising ZeroDivisionError.
    """
    percent = int(100 * n / t) if t else 0
    return "当前进度为:" + str(percent) + "%"


# ------------------------------ Program start ------------------------------
# Book categories (category id -> category name).
cate = {"3273":"历史","3279":"心理学","3276":"政治军事","3275":"国学古籍","3274":"哲学宗教","3277":"法律","3280":"文化","3281":"社会科学"}

# Resume support: ask for the book id to restart from and the page number
# after which fetching should continue for that book.
# Read plain text rather than using Python 2's input(), which evaluates
# whatever the user types as Python code.
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3
num1 = _read_line("bookid:").strip()
_page_text = _read_line("pagenumber:").strip()
num2 = int(_page_text) if _page_text else 0

# Original estimate of the total work: 17355 * 20 = 347100.
totaltimes = 347100.0
nowtimes = 0

# Start the browser. The original post also tried webdriver.PhantomJS() and
# webdriver.Chrome('C:\\Python27\\Scripts\\chromedriver') as alternatives.
# Raw string: same path as before, without relying on invalid escapes.
driver = webdriver.Ie(r'C:\Python27\Scripts\IEDriverServer')

conn = None
cursor = None
try:
    # Connect to the database holding the list of review pages to fetch.
    try:
        conn = MySQLdb.connect(host='localhost',user='root',passwd='',db='jd')
    except MySQLdb.Error as e:
        print(e)
        sys.exit(1)  # the finally clause below still closes the browser

    cursor = conn.cursor()
    cursor.execute("SELECT * FROM booknew ORDER BY pagenumber DESC")
    alldata = cursor.fetchall()

    # Parameterized statement: the driver escapes the values itself.
    insert_sql = "INSERT INTO ljj (id, cateid, bookid, date) VALUES (NULL, %s, %s, %s)"
    text_type = type(u"")

    # An empty bookid means "start from the first record".
    started = not num1
    # Only the book being resumed skips already-fetched pages.
    first_page = num2

    # Review pages look like http://club.jd.com/review/10178500-1-154.html
    for rec in alldata:
        # rec[0] -- bookid, rec[1] -- cateid, rec[2] -- pagenumber
        bookid = str(rec[0])
        cateid = rec[1]
        last_page = int(rec[2])

        if not started:
            if bookid != str(num1):
                continue
            started = True

        # Pages are 1-based: fetch first_page + 1 .. last_page inclusive.
        for p in range(first_page + 1, last_page + 1):
            link = "http://club.jd.com/review/" + bookid + "-1-" + str(p) + ".html"
            # Fetch the page.
            driver.get(link)
            html = driver.page_source
            # Extract the purchase dates.
            buydate = catchDate(html)
            # Store them. The values follow the column order
            # (cateid, bookid, date); the original passed bookid and cateid
            # the other way round.
            for z in buydate:
                if not z:
                    continue
                try:
                    cursor.execute(insert_sql, (cateid, bookid, text_type(z[0])))
                except MySQLdb.Error as e:
                    # Best effort: report the failed row and keep going.
                    print(e)
            conn.commit()

        # Every later book starts from its first page, even when the resumed
        # book had no pages left to fetch.
        first_page = 0
        print(getTimes(nowtimes,totaltimes))
finally:
    # Always release the browser and database handles, even on errors.
    driver.quit()
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()

猜你喜欢

转载自www.cnblogs.com/huohuohuo1/p/9175091.html
今日推荐