The code is shown below:
from DrawStu.DrawStu import DrawStu
import io
import sys

# Re-wrap stdout so Chinese text prints correctly on consoles using GB18030.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='GB18030')

# Initialize the crawler object.
draw = DrawStu()

if __name__ == '__main__':
    print('crawling graduate transfer information')
    # Total number of listing pages reported by the site.
    size = draw.get_page_size()
    print(size)
    for x in range(size):
        # The site paginates with a ?start= offset of 50 items per page.
        start = 50 * x
        print(start)
        created_url = 'https://yz.chsi.com.cn/kyzx/tjxx/?start=' + str(start)
        draw.draw_base_list(created_url)
import sqlite3


class DB(object):
    """SQLite-backed storage for crawled news records.

    Creates (on demand) and writes to a single table ``mynews`` with the
    columns CrawlTime, Title, Content, PublishTime and Origin.
    """

    def __init__(self, db_path=r'test.db'):
        """Open the database connection and a cursor.

        :param db_path: path of the SQLite file (default ``test.db``);
            pass ``':memory:'`` for an in-memory database.
        """
        self.conn = sqlite3.connect(db_path)
        self.cus = self.conn.cursor()

    def create_table(self):
        """Create the ``mynews`` table if it does not exist yet."""
        sql = ('CREATE TABLE if not exists mynews '
               '(CrawlTime char,Title char,Content char,PublishTime char,Origin char)')
        self.conn.execute(sql)
        self.conn.commit()
        print('create table successfully')

    def insert_into_news(self, ops):
        """Insert one news record.

        :param ops: dict with keys CrawlTime, Title, Content,
            PublishTime and Origin.
        """
        # Parameterized statement — never build SQL by string concatenation.
        self.conn.execute(
            'insert into mynews(CrawlTime,Title,Content,PublishTime,Origin) '
            'values(?,?,?,?,?)',
            (ops['CrawlTime'], ops['Title'], ops['Content'],
             ops['PublishTime'], ops['Origin']))
        self.conn.commit()
import urllib.request
import time
from bs4 import BeautifulSoup
from DB.DB import DB

db = DB()


class DrawStu:
    """Crawler for graduate transfer (tiaoji) news on yz.chsi.com.cn.

    Fetches the paginated news list, follows each item to its detail
    page, and stores the extracted record through the ``DB`` helper.
    """

    def __init__(self):
        self.baseurl = 'https://yz.chsi.com.cn/kyzx/tjxx/'
        # Make sure the target table exists before any insert.
        db.create_table()

    def commonsdk(self, url):
        """Fetch *url* and return the parsed BeautifulSoup document."""
        response = urllib.request.urlopen(url)
        html = response.read()
        print(html)
        # Name the parser explicitly; BeautifulSoup(html) alone picks an
        # installation-dependent parser and emits a warning.
        doc = BeautifulSoup(html, 'html.parser')
        return doc

    def draw_base_list(self, url):
        """Crawl one listing page and process every news item on it."""
        print('url is :::', url)
        doc = self.commonsdk(url)
        # NOTE(review): CSS class reconstructed from the garbled source as
        # 'news-list' — confirm against the live page markup.
        lilist = doc.find('ul', {'class': 'news-list'}).findAll('li')
        for x in lilist:
            Title = x.find('a').text
            Time = x.find('span').text
            # hrefs on the listing page are site-relative.
            Link = 'https://yz.chsi.com.cn' + x.find('a').get('href')
            self.draw_detail_list(Link, Title, Time)

    def draw_detail_list(self, url, Title, Time):
        """Crawl one detail page and store the record in the database.

        :param url: absolute URL of the detail page.
        :param Title: item title taken from the listing page.
        :param Time: publish time taken from the listing page.
        """
        doc = self.commonsdk(url)
        from_info = doc.find('span', {'class': 'news-from'}).text
        content = doc.find('div', {'class': 'content-l detail'}).text
        # Timestamp of the crawl itself (local time).
        ctime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        data = {
            'CrawlTime': ctime,
            'Title': Title,
            'Content': content,
            'PublishTime': Time,
            'Origin': from_info,
        }
        print(data)
        print('into the database')
        db.insert_into_news(data)

    def get_page_size(self):
        """Return the total number of listing pages.

        The pager widget shows text like ``'1/63'``; the part after the
        slash is the page count.
        """
        pcxt = self.commonsdk(self.baseurl).find(
            'div', {'class': 'pageC'}).findAll('span')[0].text
        print(pcxt)
        pagesize = pcxt.strip()
        pagearr = pagesize.split('/')
        pagestr = pagearr[1]
        # Original code keeps only the first two characters, capping the
        # result at 99 pages — preserved for compatibility.
        return int(pagestr[0:2])
Press F12 in the browser to inspect the page elements.
Crawling results:
Viewing the stored data as a database table with a database client gives the following result:
Create a new query and enter: select * from mynews
Each record holds the transfer information published by one school, ready to query.