Python: crawling graduate transfer (调剂) information from the China Graduate Admissions Information Network (yz.chsi.com.cn)

The code is as follows:

from DrawStu.DrawStu import DrawStu
import time
import io
import sys

# re-wrap stdout so GB18030-encoded text prints correctly on the console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='GB18030')

# initialize the crawler object
draw = DrawStu()

if __name__ == '__main__':
    print('crawling graduate transfer information')
    size = draw.get_page_size()
    print(size)
    for x in range(size):
        start = 50 * x
        print(start)
        created_url = 'https://yz.chsi.com.cn/kyzx/tjxx/?start=' + str(start)
        draw.draw_base_list(created_url)
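The main script imports time but never uses it. If you want to be gentler on the server, a short pause between page requests is easy to add; a minimal sketch of the same loop with a one-second delay (the delay value is my own choice, not from the original code), reusing the size and draw objects defined above:

import time

for x in range(size):
    start = 50 * x  # the list advances 50 entries per page
    created_url = 'https://yz.chsi.com.cn/kyzx/tjxx/?start=' + str(start)
    draw.draw_base_list(created_url)
    time.sleep(1)  # pause briefly so consecutive requests are not back to back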
import sqlite3

class DB(object):
    """Implements the database access methods: the connection object
    and the cursor it generates during initialization."""
    def __init__(self):
        # 1. create the database connection object
        self.conn = sqlite3.connect('test.db')
        # 2. create a cursor for executing statements
        self.cus = self.conn.cursor()

    def create_table(self):
        sql = "CREATE TABLE if not exists mynews (CrawlTime char, Title char, Content char, PublishTime char, Origin char)"
        self.conn.execute(sql)
        self.conn.commit()
        print('create table successfully')

    def insert_into_news(self, ops):
        self.conn.execute(
            'insert into mynews(CrawlTime, Title, Content, PublishTime, Origin) values(?,?,?,?,?)',
            (ops['CrawlTime'], ops['Title'], ops['Content'], ops['PublishTime'], ops['Origin']))
        self.conn.commit()
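Before wiring the class into the crawler, it can be exercised on its own. A minimal sketch, assuming the module above is saved as DB/DB.py (which is what the crawler's import below expects); the sample field values are made up:

from DB.DB import DB
import time

db = DB()
db.create_table()
# insert one hand-made record to confirm the schema and the insert statement
db.insert_into_news({
    'CrawlTime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
    'Title': 'test title',
    'Content': 'test content',
    'PublishTime': '2019-11-24',
    'Origin': 'test origin',
})
# read it back through the same connection
for row in db.conn.execute('select * from mynews'):
    print(row)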
# requires urllib.request (standard library) and BeautifulSoup
import urllib.request
from bs4 import BeautifulSoup
from DB.DB import DB
import time

db = DB()

# Core crawling module; only responsible for fetching the graduate transfer information.


class DrawStu():
    """docstring for DrawStu"""
    def __init__(self):
        self.baseurl = 'https://yz.chsi.com.cn/kyzx/tjxx/'
        db.create_table()

    # common fetch-and-parse helper shared by the crawling methods
    def commonsdk(self, url):
        response = urllib.request.urlopen(url)
        html = response.read()  # read the raw bytes so the encoding is not mangled
        print(html)
        doc = BeautifulSoup(html, 'html.parser')
        return doc

    # crawl the top-level article list
    def draw_base_list(self, url):
        print('url is :::', url)
        doc = self.commonsdk(url)
        lilist = doc.find('ul', {'class': 'news-list'}).findAll('li')
        # extract the first-level fields from each list item
        for x in lilist:
            Title = x.find('a').text
            Time = x.find('span').text
            Link = 'https://yz.chsi.com.cn' + x.find('a').get('href')
            self.draw_detail_list(Link, Title, Time)

    # crawl the second-level detail page of each article
    def draw_detail_list(self, url, Title, Time):
        doc = self.commonsdk(url)
        from_info = doc.find('span', {'class': 'news-from'}).text
        content = doc.find('div', {'class': 'content-l detail'}).text
        ctime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

        # assemble the fields into a dict and hand it to the database API
        data = {
            'CrawlTime': ctime,
            'Title': Title,
            'Content': content,
            'PublishTime': Time,
            'Origin': from_info
        }
        print(data)
        print('into the database')
        db.insert_into_news(data)

    # read the total page count from the pager widget
    def get_page_size(self):
        requesturl = self.baseurl
        pcxt = self.commonsdk(requesturl).find('div', {'class': 'pageC'}).findAll('span')[0].text
        print(pcxt)
        # plain string slicing instead of a regular expression
        pagesize = pcxt.strip()
        pagearr = pagesize.split('/')
        pagestr = pagearr[1]
        return int(pagestr[0:2])
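Note that get_page_size assumes the pager text has the form 'current/total' (for example '1/64') and that the total has exactly two digits, because of the pagestr[0:2] slice; a three-digit page count would be truncated. A slightly more tolerant sketch of the same parsing:

def parse_page_size(pcxt):
    # pcxt is expected to look like '1/64' (current page / total pages)
    pagestr = pcxt.strip().split('/')[1]
    return int(pagestr)  # convert the whole field instead of slicing two characters

print(parse_page_size('1/64'))   # 64
print(parse_page_size('1/128'))  # 128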
        
        

Press F12 in the browser to inspect the page elements and find the class names used in the selectors above (news-list, news-from, content-l detail, pageC).
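Before running the full crawler, it is worth confirming in a Python session that those class names still match the live page; a quick sketch (the site's markup may of course change over time):

import urllib.request
from bs4 import BeautifulSoup

html = urllib.request.urlopen('https://yz.chsi.com.cn/kyzx/tjxx/').read()
doc = BeautifulSoup(html, 'html.parser')
# print the first three entries to verify the 'news-list' selector
for li in doc.find('ul', {'class': 'news-list'}).findAll('li')[:3]:
    print(li.find('span').text, li.find('a').text.strip())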

Crawling results:

Opening test.db in a SQLite database viewer shows the table as follows:

New query: select * from mynews

The table records the crawled information for each school, which can then be queried.
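The same query can also be run from Python with the standard sqlite3 module; a minimal sketch:

import sqlite3

conn = sqlite3.connect('test.db')
for row in conn.execute('select * from mynews'):
    print(row)  # each row is (CrawlTime, Title, Content, PublishTime, Origin)
conn.close()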


Origin: www.cnblogs.com/yezishen/p/11923873.html