import requests
import re
import xlwt
def main():
    """Scrape the Dangdang bestseller list and save it as ``dangdang.xls``.

    Writes a header row to a fresh worksheet, then for pages 1 through 3
    downloads the page with ``getHTMLText``, extracts the book tuples with
    ``parse_result``, prints them, and appends one spreadsheet row per book.
    The workbook is saved once all pages have been processed.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    mysheet = workbook.add_sheet('mysheet')

    # Column titles, in the same order as the fields of each parsed item.
    headers = ('排名', '图片地址', '书名', '评论数量', '推荐指数', '作者', '出版社', '价格')
    for col, title in enumerate(headers):
        mysheet.write(0, col, title)

    # Row 0 holds the headers, so data starts at row 1 and keeps counting
    # across pages.
    row = 1
    for page in range(1, 4):
        url = 'http://bang.dangdang.com/books/bestsellers/1-' + str(page)
        items = parse_result(getHTMLText(url))
        print(items)
        for item in items:
            # Pull out every field up front so a short item fails before
            # any cell of the row is written.
            values = [item[col] for col in range(len(headers))]
            for col, value in enumerate(values):
                mysheet.write(row, col, value)
            row += 1

    workbook.save('dangdang.xls')
def getHTMLText(url):
    """Download *url* and return the response body as text.

    The request uses a 30 second timeout, and a non-success HTTP status is
    treated as a failure. The body is decoded with the encoding that
    requests detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the sentinel string ``"异常"`` when the
        request fails.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures (connection, timeout, bad status,
        # invalid URL) map to the sentinel; KeyboardInterrupt, SystemExit
        # and programming errors are no longer silently swallowed.
        return "异常"
def parse_result(html):
    """Extract bestseller entries from a Dangdang list page.

    Args:
        html: Page markup as a string.

    Returns:
        A list of 8-tuples of strings, one per matched ``<li>`` element, in
        the order: rank, image URL, book title, review count text,
        recommendation text, author, publisher, price (without the leading
        currency sign). The list is empty when nothing matches.
    """
    # re.S lets ``.`` span newlines, since each entry covers many lines.
    book_re = re.compile(
        '<li>.*?list_num.*?>(.*?).</div>.*?<img.*?src="(.*?)".*?class="name".*?title="(.*?)".*?class="level".*?target="_blank">(.*?)</a>.*?class="tuijian">(.*?)</span>.*?class="publisher_info".*?title="(.*?)".*?</a>.*?class="publisher_info".*?target="_blank">(.*?)</a>.*?class="price_n">¥(.*?)</span>.*?</li>',
        re.S,
    )
    return book_re.findall(html)
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()