Python爬虫-Beautiful Soup-当当图书目录(1)

Python爬虫-Beautiful Soup-当当图书目录(1)

第一次用python + Beautiful Soup爬些数据,用当当图书目录作为练习了。

思路:
- 1. 获取当当的图书类别:类别名称 + 链接 url

效果:
采集的效果图

脚本说明:
- 1. mylog.py:日志
- 2. getBookKindInfo.py:图书目录


mylog.py

# !/usr/bin/env python
#-*- coding:utf-8 -*-
'''
Created on 2018-4-10
@author: Administrator
'''
import logging
import getpass
import sys

#### 定义MyLog类
class MyLog(object):
    """Self-contained logger: ERROR+ goes to a file, everything to the screen.

    The log file is named after the running script (``<script>.log``).
    """
    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)

        #### Log file name: running script with the ".py" suffix swapped for ".log"
        self.logFile = sys.argv[0][0:-3] + '.log'
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s')

        # logging.getLogger returns the SAME logger object for the same name,
        # so without this guard every MyLog() instantiation would attach a
        # fresh pair of handlers and each message would be emitted repeatedly.
        if not self.logger.handlers:
            # File handler: only errors and above are persisted to disk.
            logHand = logging.FileHandler(self.logFile)
            logHand.setFormatter(self.formatter)
            logHand.setLevel(logging.ERROR)

            # Stream handler: echo every level to the console.
            logHandSt = logging.StreamHandler()
            logHandSt.setFormatter(self.formatter)

            self.logger.addHandler(logHand)
            self.logger.addHandler(logHandSt)

    # Convenience wrappers for the five standard logging levels.
    def debug(self, msg):
        self.logger.debug(msg)

    def info(self, msg):
        self.logger.info(msg)

    def warn(self, msg):
        # Logger.warn() is a deprecated alias; delegate to warning().
        self.logger.warning(msg)

    def error(self, msg):
        self.logger.error(msg)

    def critical(self, msg):
        self.logger.critical(msg)

if __name__ == '__main__':
    # Smoke test: emit one message at each of the five logging levels.
    log = MyLog()
    for emit, text in (
        (log.debug, "我是一个debug"),
        (log.info, "我是一个info"),
        (log.warn, "我是一个warn"),
        (log.error, "我是一个error"),
        (log.critical, "我是一个critical"),
    ):
        emit(text)

getBookKindInfo.py

#! /usr/bin/env python
#-*- coding:utf-8 -*-

'''
Created on 2018-4-10
@author: Administrator
获取当当图书种类, 大类名称+大类url,小类名称+小类url
'''
import re
from bs4 import BeautifulSoup
import urllib2
from mylog import MyLog as mylog

class BookKindItem(object):
    '''图书种类 — one book category: display name plus its catalogue URL.'''

    def __init__(self, name=None, url=None):
        # Per-instance attributes instead of the original class-level
        # attributes (an idiom hazard: class attributes are shared by all
        # instances until shadowed). The None defaults keep the no-argument
        # construction that existing callers use.
        self.name = name  # 种类名称 (category name)
        self.url = url    # 种类的url (category URL)

class GetBookKindItem(object):
    '''获取当当图书种类 — scrape dangdang.com's book category catalogue.'''

    def __init__(self):
        self.urls = []
        self.log = mylog()

    def getResponseContent(self, url):
        """Fetch *url* and return the raw response body, or None on failure.

        The narrowed ``except Exception`` replaces the original bare
        ``except:`` (which also swallowed SystemExit/KeyboardInterrupt);
        the explicit ``return None`` makes the failure path visible.
        """
        try:
            response = urllib2.urlopen(url.encode('utf8'))
        except Exception:
            self.log.error(u'python 返回 URL:%s 数据失败' % url)
            return None
        else:
            self.log.info(u'Python 返回URL:%s A数据成功' % url)
            return response.read()

    def getUrls(self):
        """Return ``(DL, XL)``: lists of BookKindItem for major/minor categories."""
        URL = r'http://category.dangdang.com/?ref=www-0-C'
        htmlContent = self.getResponseContent(URL)
        if htmlContent is None:
            # Download failed; return empty results instead of crashing
            # inside BeautifulSoup with a None document.
            return [], []

        # The page is GBK-encoded; decoding it as utf8 loses part of the
        # category list (original author's observation).
        soup = BeautifulSoup(htmlContent, 'lxml', from_encoding='gbk')

        DL = []  # 大类 major categories
        XL = []  # 小类 minor categories

        # Each "classify_kind" div holds one major category (its first
        # div/a) plus a <ul> of that category's minor categories.
        for outsideDiv in soup.find("div", class_="classify_books", id="floor_1").find_all("div", class_="classify_kind"):
            # 图书大类 (major category)
            item_dl = BookKindItem()
            item_dl.name = outsideDiv.div.a.string
            item_dl.url = outsideDiv.div.a.get("href")
            DL.append(item_dl)

            # 图书小类 (minor categories)
            for _li in outsideDiv.find("ul").find_all("li"):
                # "更多" ("more") is a navigation link, not a real category.
                if _li.a.string == "更多":
                    continue
                item_xl = BookKindItem()
                item_xl.name = _li.a.string
                item_xl.url = _li.a.get("href")
                XL.append(item_xl)
        return DL, XL

if __name__ == '__main__':
    # Scrape the dangdang catalogue and print major then minor categories.
    scraper = GetBookKindItem()
    major, minor = scraper.getUrls()

    print (' ## 图书大类' )
    for item in major:
        print (' %s----%s' % (item.name, item.url))

    print (' ## 图书小类' )
    for item in minor:
        print (' %s----%s' % (item.name, item.url))

猜你喜欢

转载自blog.csdn.net/coolhe21cn/article/details/79915677
今日推荐