Python crawler-Beautiful Soup-Dangdang Book Catalog (1)-Advanced

Python crawler-Beautiful Soup-Dangdang Book Catalog (1)-Advanced

Ideas:
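- 1. Extract the shared method into a common helper: getResponseContent(self, url);
- 2. mylog.py is unchanged; a new myfun.py script is added and the getBookKindInfo.py script is modified to use it.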

Script Description:
- 1. mylog.py: Logging helper
- 2. getBookKindInfo.py: Fetches the book category catalog (category names and URLs)
- 3. myfun.py: Shared helper functions (getResponseContent)


myfun.py

#! /usr/bin/env python
#-*- coding:utf-8 -*-
'''
Created on 2018-4-12
@author: Administrator
'''
import urllib2
from mylog import MyLog as mylog

class MyFun(object):
    """Shared helper functions used by the crawler scripts."""

    def __init__(self):
        # Logger used to record the outcome of every fetch.
        self.log = mylog()

    def getResponseContent(self, url):
        """Fetch ``url`` and return the raw response body.

        The outcome is recorded through ``self.log``: an error entry when
        the fetch fails, an info entry once the body has been read.

        :param url: address to fetch; it is UTF-8 encoded before the request.
        :return: whatever ``response.read()`` yields, or ``None`` when
            opening or reading the URL fails.
        """
        response = None
        try:
            response = urllib2.urlopen(url.encode('utf8'))
            content = response.read()
        except Exception:
            # Best-effort fetch: any ordinary failure is logged and reported
            # to the caller as None. Catching Exception (rather than a bare
            # except) lets KeyboardInterrupt and SystemExit propagate.
            self.log.error(u'python 返回 URL:%s 数据失败' % url)
            return None
        finally:
            # Release the connection whether or not the read succeeded.
            if response is not None:
                response.close()

        # Success is logged only after the body was actually read.
        self.log.info(u'Python 返回URL:%s A数据成功' % url)
        return content

getBookKindInfo.py

#! /usr/bin/env python
#-*- coding:utf-8 -*-

'''
Created on 2018-4-10
@author: Administrator
获取当当图书种类, 大类名称+大类url,小类名称+小类url
'''
import re
from bs4 import BeautifulSoup 
from myfun import MyFun as myfun

class BookKindItem(object):
    """A book category: its name and the URL it links to."""

    # Category name; None until assigned.
    name = None

    # Category URL; None until assigned.
    url = None

class GetBookKindItem(object):
    """Collect book categories from the Dangdang category page."""

    def __init__(self):
        # Kept for interface compatibility; nothing in this class fills it.
        self.urls = []

    def getUrls(self):
        """Fetch the category page and extract its book categories.

        :return: a ``(DL, XL)`` tuple of ``BookKindItem`` lists, where
            ``DL`` holds the top-level categories and ``XL`` the
            sub-categories. Both lists are empty when the page cannot be
            fetched or the expected category container is not found.
        """
        URL = r'http://category.dangdang.com/?ref=www-0-C'

        # Top-level categories.
        DL = []
        # Sub-categories.
        XL = []

        htmlContent = myfun().getResponseContent(URL)
        if not htmlContent:
            # The fetch helper returns None on failure; nothing to parse.
            return DL, XL

        # Parsed as GBK on purpose: per the original author's note, switching
        # this to utf8 fails to retrieve the full set of book categories.
        soup = BeautifulSoup(htmlContent, 'lxml', from_encoding='gbk')

        container = soup.find("div", class_="classify_books", id="floor_1")
        if container is None:
            # Page layout differs from what this scraper expects.
            return DL, XL

        # outsideDiv: the outer div wrapping one top-level category.
        for outsideDiv in container.find_all("div", class_="classify_kind"):
            # Top-level category: first anchor inside the first nested div.
            headingDiv = outsideDiv.div
            headingLink = headingDiv.a if headingDiv is not None else None
            if headingLink is not None:
                item_dl = BookKindItem()
                item_dl.name = headingLink.string
                item_dl.url = headingLink.get("href")
                DL.append(item_dl)

            # Sub-categories: one anchor per <li> of the first <ul>.
            subList = outsideDiv.find("ul")
            if subList is None:
                continue
            for _li in subList.find_all("li"):
                link = _li.a
                if link is None:
                    continue
                # Skip the "more" link. The literal must be unicode: under
                # Python 2 a non-ASCII byte string never compares equal to
                # the unicode text returned by BeautifulSoup.
                if link.string == u"更多":
                    continue
                item_xl = BookKindItem()
                item_xl.name = link.string
                item_xl.url = link.get("href")
                XL.append(item_xl)

        return DL, XL

if __name__ == '__main__':
    # Fetch both category lists from the category page in a single call.
    scraper = GetBookKindItem()
    categories = scraper.getUrls()

    # Index 0 holds the top-level categories, index 1 the sub-categories;
    # each list is printed under its own heading, in that order.
    headings = (' ## 图书大类', ' ## 图书小类')
    for position, heading in enumerate(headings):
        print(heading)
        for kind in categories[position]:
            print(' %s----%s' % (kind.name, kind.url))

Guess you like

Origin blog.csdn.net/coolhe21cn/article/details/79915874