Because I wanted to build an "opinions" feature — our house of opinions is similar to Zhihu's topics — I had to find a way to crawl that data. After a long time I finally got it working properly. The code is written in Python; if something breaks, you'll need to troubleshoot it yourself! Those who know Python can go straight to the code — it is fully working.
#coding:utf-8 """ @author:haoning @create time:2015.8.5 """ from __future__ import division # exact division from Queue import Queue from __builtin__ import False import json import them import re import platform import uuid import urllib import urllib2 import sys import time import MySQLdb as mdb from bs4 import BeautifulSoup reload(sys) sys.setdefaultencoding( "utf-8" ) headers = { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0', 'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8', 'X-Requested-With':'XMLHttpRequest', 'Referer':'https://www.zhihu.com/topics', 'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a' } DB_HOST = '127.0.0.1' DB_USER = 'root' DB_PASS = 'root' queue= Queue() #Receive queue nodeSet=set() keywordSet=set() stop=0 offset=-20 level=0 maxLevel=7 counter=0 base="" conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8') conn.autocommit(False) curr = conn.cursor() def get_html(url): try: req = urllib2.Request(url) response = urllib2.urlopen(req,None,3) #A proxy should be added here html = response.read() return html except: pass return None def getTopics(): url = 'https://www.zhihu.com/topics' print url try: req = urllib2.Request(url) response = urllib2.urlopen(req) #鍦ㄨ umbrella reading 屽簲 Xuan ュ姞 鍏 ヤ bluff. 
html = response.read().decode('utf-8') print html soup = BeautifulSoup(html) lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'}) for li in lis: data_id=li.get('data-id') name=li.text curr.execute('select id from classify_new where name=%s',(name)) y= curr.fetchone() if not y: curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name)) conn.commit() except Exception as e: print "get topic error",e def get_extension(name): where=name.rfind('.') if where!=-1: return name[where:len(name)] return None def which_platform(): sys_str = platform.system() return sys_str def GetDateString(): when=time.strftime('%Y-%m-%d',time.localtime(time.time())) foldername = str(when) return foldername def makeDateFolder(par,classify): try: if os.path.isdir(par): newFolderName=par + '//' + GetDateString() + '//' +str(classify) if which_platform()=="Linux": newFolderName=par + '/' + GetDateString() + "/" +str(classify) if not os.path.isdir( newFolderName ): os.makedirs( newFolderName ) return newFolderName else: return None except Exception,e: print "kk",e return None def download_img(url,classify): try: extention=get_extension(url) if(extention is None): return None req = urllib2.Request(url) resp = urllib2.urlopen(req,None,3) dataimg=resp.read() name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention top="E://topic_pic" folder=makeDateFolder(top, classify) filename=None if folder is not None: filename =folder+"//"+name try: if "e82bab09c_m" in str(url): return True if not os.path.exists(filename): file_object = open(filename,'w+b') file_object.write(dataimg) file_object.close() return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name else: print "file exist" return None except IOError,e1: print "e1=",e1 pass except Exception as e: print "eee",e pass return None #If it is not downloaded, use the link of the original website def getChildren(node,name): global queue,nodeSet try: url="https://www.zhihu.com/topic/"+str(node)+"/hot" 
html=get_html(url) if html is None: return soup = BeautifulSoup(html) p_ch='parent topic' node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text topic_cla=soup.find('div', {'class' : 'child-topic'}) if topic_cla is not None: try: p_ch=str(topic_cla.text) aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #Get all child nodes if u'subtopic' in p_ch: for a in aList: token=a.get('data-token') a=str(a).replace('\n','').replace('\t','').replace('\r','') start=str(a).find('>') end=str(a).rfind('</a>') new_node=str(str(a)[start+1:end]) curr.execute('select id from rooms where name=%s',(new_node)) #Make sure the names are never the same y= curr.fetchone() if not y: print "y=",y,"new_node=",new_node,"token=",token queue.put((token,new_node,node_name)) except Exception as e: print "add queue error",e except Exception as e: print "get html error",e def getContent(n,name,p,top_id): try: global counter curr.execute('select id from rooms where name=%s',(name)) #Make sure the names are never the same y= curr.fetchone() print "exist?? 
",y,"n=",n if not y: url="https://www.zhihu.com/topic/"+str(n)+"/hot" html=get_html(url) if html is None: return soup = BeautifulSoup(html) title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src') description=soup.find('div',{'class':'zm-editable-content'}) if description is not None: description=description.text if (u"uncategorized" in title or u"root topic" in title): #Allow storage to avoid infinite loop description=None tag_path=download_img(pic_path,top_id) print "tag_path=",tag_path if (tag_path is not None) or tag_path==True: if tag_path==True: tag_path=None father_id=2 #The default is chat curr.execute('select id from rooms where name=%s',(p)) results = curr.fetchall() for r in results: father_id=r[0] name=title curr.execute('select id from rooms where name=%s',(name)) #Make sure the names are never the same y= curr.fetchone() print "store see..",y if not y: friends_num=0 temp = time.time() x = time.localtime(float(temp)) create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now create_time creater_id=None room_avatar=tag_path is_pass=1 has_index=0 reason_id=None #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id #######################Content eligible for storage counter=counter+1 curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)) conn.commit() #Must enter the database from time to time, otherwise the parent node cannot be found if counter % 200==0: print "current node",name,"num",counter except Exception as e: print "get content error",e def work(): global queue curr.execute('select id,node,parent,name from classify where status=1') results = curr.fetchall() for r in results: top_id=r[0] 
node=r[1] parent=r[2] name=r[3] try: queue.put((node,name,parent)) #First put into the queue while queue.qsize() >0: n,p=queue.get() #Top node dequeue getContent(n,p,top_id) getChildren(n,name) #Children of dequeuing content conn.commit() except Exception as e: print "what's wrong",e def new_work(): global queue curr.execute('select id,data_id,name from classify_new_copy where status=1') results = curr.fetchall() for r in results: top_id=r[0] data_id=r[1] name=r[2] try: get_topis(data_id,name,top_id) except: pass def get_topis(data_id,name,top_id): global queue url = 'https://www.zhihu.com/node/TopicsPlazzaListV2' isGet = True; offset = -20; data_id=str(data_id) while isGet: offset = offset + 20 values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'} try: msg=None try: data = urllib.urlencode(values) request = urllib2.Request(url,data,headers) response = urllib2.urlopen(request,None,5) html=response.read().decode('utf-8') json_str = json.loads(html) ms=json_str['msg'] if len(ms) <5: break msg=ms[0] except Exception as e: print "eeeee",e #print msg if msg is not None: soup = BeautifulSoup(str(msg)) blks = soup.find_all('div', {'class' : 'blk'}) for blk in blks: page=blk.find('a').get('href') if page is not None: node=page.replace("/topic/","") # put more seeds into the database parent=name ne=blk.find('strong').text try: queue.put((node,ne,parent)) #First put into the queue while queue.qsize() >0: n,name,p=queue.get() #Top node dequeue size=queue.qsize() if size > 0: print size getContent(n,name,p,top_id) getChildren(n,name) #Children of dequeuing content conn.commit() except Exception as e: print "what's wrong",e except urllib2.URLError, e: print "error is",e pass if __name__ == '__main__': i=0 while i<400: new_work() i=i+1
As for the database: I won't upload a schema attachment here — create the tables and fields yourself, because it really is very simple. I use MySQL; you can set it up according to your own needs.
If anything is unclear, you can find me at carousel.com, which I also developed.