A crawler for Zhihu topics

I'm building an opinions feature, and its "rooms" of opinions are similar to Zhihu's topics, so I had to find a way to crawl them. After fiddling for a long time I finally got it working. The code is written in Python; if something is unfamiliar, look it up yourself. If you already know Python, go straight to the code: it definitely works.
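Before the full listing, here is the crux in isolation: Zhihu's topic plaza is paged by POSTing to the /node/TopicsPlazzaListV2 endpoint with an offset that grows by 20, and each response carries HTML fragments in its msg field. A minimal sketch follows; the endpoint, field names, and parameters are taken from the code below, the headers are stripped down (the real listing also sends cookies), and whether Zhihu still serves this endpoint today is not guaranteed.

#coding:utf-8
import json
import urllib
import urllib2

def fetch_topic_page(data_id, offset):
    # POST one paging request; 'params' is a JSON string, offset advances by 20
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    values = {'method': 'next',
              'params': '{"topic_id":%s,"offset":%d,"hash_id":""}' % (data_id, offset)}
    headers = {'User-Agent': 'Mozilla/5.0', 'X-Requested-With': 'XMLHttpRequest'}
    request = urllib2.Request(url, urllib.urlencode(values), headers)
    response = urllib2.urlopen(request, None, 5)
    # 'msg' is a list of HTML fragments, one per topic block on that page
    return json.loads(response.read().decode('utf-8'))['msg']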

#coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division # true division
from Queue import Queue
import json
import os
import re
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding( "utf-8" )

headers = {
   'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
   'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
   'X-Requested-With':'XMLHttpRequest',
   'Referer':'https://www.zhihu.com/topics',
   'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

queue = Queue() # BFS queue of (token, name, parent) tuples
counter = 0     # number of rooms inserted so far
# The globals below are leftovers; nothing in this script reads them
nodeSet = set()
keywordSet = set()
stop = 0
offset = -20
level = 0
maxLevel = 7
base = ""

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req,None,3) #A proxy should be added here
        html = response.read()
        return html
    except:
        pass
    return None

def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req) #A proxy should be added here
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})
        
        for li in lis:
            data_id=li.get('data-id')
            name=li.text
            curr.execute('select id from classify_new where name=%s',(name,))
            y= curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))
        conn.commit()
    except Exception as e:
        print "get topic error",e
        

def get_extension(name):  
    where=name.rfind('.')
    if where!=-1:
        return name[where:]
    return None


def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    when=time.strftime('%Y-%m-%d',time.localtime(time.time()))
    foldername = str(when)
    return foldername

def makeDateFolder(par,classify):
    try:
        if os.path.isdir(par):
            newFolderName=par + '//' + GetDateString() + '//'  +str(classify)
            if which_platform()=="Linux":
                newFolderName=par + '/' + GetDateString() + "/" +str(classify)
            if not os.path.isdir( newFolderName ):
                os.makedirs( newFolderName )
            return newFolderName
        else:
            return None
    except Exception,e:
        print "kk",e
    return None

def download_img(url,classify):
    try:
        extention=get_extension(url)
        if(extention is None):
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req,None,3)
        dataimg=resp.read()
        name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
        top="E://topic_pic"
        folder=makeDateFolder(top, classify)
        try:
            if "e82bab09c_m" in str(url): # appears to be Zhihu's default avatar; skip saving it
                return True
            if folder is None: # no folder means nowhere to write
                return None
            filename = folder+"//"+name
            if not os.path.exists(filename):
                file_object = open(filename,'w+b')
                file_object.write(dataimg)
                file_object.close()
                return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name
            else:
                print "file exist"
                return None
        except IOError,e1:
            print "download_img write error",e1
    except Exception as e:
        print "download_img error",e
    return None #If it is not downloaded, use the link of the original website

def getChildren(node,name):
    global queue,nodeSet
    try:
        url="https://www.zhihu.com/topic/"+str(node)+"/hot"
        html=get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch=u'父话题' # "parent topic": the labels on the page are Chinese, so the literals must be too
        node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
        topic_cla=soup.find('div', {'class' : 'child-topic'})
        if topic_cla is not None:
            try:
                p_ch=str(topic_cla.text)
                aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #Get all child nodes
                if u'子话题' in p_ch: # "subtopic"
                    for a in aList:
                        token=a.get('data-token')
                        a=str(a).replace('\n','').replace('\t','').replace('\r','')
                        start=str(a).find('>')
                        end=str(a).rfind('</a>')
                        new_node=str(str(a)[start+1:end])
                        curr.execute('select id from rooms where name=%s',(new_node,)) #Make sure the names are never the same
                        y= curr.fetchone()
                        if not y:
                            print "y=",y,"new_node=",new_node,"token=",token
                            queue.put((token,new_node,node_name))
            except Exception as e:
                print "add queue error",e
    except Exception as e:
        print "get html error",e
        
    

def getContent(n,name,p,top_id):
    try:
        global counter
        curr.execute('select id from rooms where name=%s',(name,)) #Make sure the names are never the same
        y= curr.fetchone()
        print "exist?? ",y,"n=",n
        if not y:
            url="https://www.zhihu.com/topic/"+str(n)+"/hot"
            html=get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
            pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')
            description=soup.find('div',{'class':'zm-editable-content'})
            if description is not None:
                description=description.text
                
            if (u"uncategorized" in title or u"root topic" in title): #Allow storage to avoid infinite loop
                description=None
                
            tag_path=download_img(pic_path,top_id)
            print "tag_path=",tag_path
            if tag_path is not None: # True means the default avatar: keep the row but store no avatar path
                if tag_path==True:
                    tag_path=None
                father_id=2 #The default is chat
                curr.execute('select id from rooms where name=%s',(p,))
                results = curr.fetchall()
                for r in results:
                    father_id=r[0]
                name=title
                curr.execute('select id from rooms where name=%s',(name,)) #Make sure the names are never the same
                y= curr.fetchone()
                print "store see..",y
                if not y:
                    friends_num=0
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()) # current time
                    creater_id=None
                    room_avatar=tag_path
                    is_pass=1
                    has_index=0
                    reason_id=None  
                    #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id
                    #######################Content eligible for storage
                    counter=counter+1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))
                    conn.commit() #Commit right away, otherwise later lookups cannot find the parent row
                    if counter % 200==0:
                        print "current node",name,"num",counter
    except Exception as e:
        print "get content error",e       

def work(): # older entry point, kept for reference; __main__ below uses new_work()
    global queue
    curr.execute('select id,node,parent,name from classify where status=1')
    results = curr.fetchall()
    for r in results:
        top_id=r[0]
        node=r[1]
        parent=r[2]
        name=r[3]
        try:
            queue.put((node,name,parent)) #First put into the queue
            while queue.qsize() >0:
                n,name,p=queue.get() # dequeue the front node
                getContent(n,name,p,top_id)
                getChildren(n,name) # enqueue the dequeued node's children
            conn.commit()
        except Exception as e:
            print "what's wrong",e  
            
def new_work():
    global queue
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id=r[0]
        data_id=r[1]
        name=r[2]
        try:
            get_topis(data_id,name,top_id)
        except Exception as e:
            print "new_work error",e


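# get_topis: page through one category of the topic plaza. Each POST to the
# TopicsPlazzaListV2 node advances offset by 20; every returned HTML fragment
# yields seed topics that are pushed onto the queue and drained breadth-first.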
def get_topis(data_id,name,top_id):
    global queue
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    data_id=str(data_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}
        try:
            msg=None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url,data,headers)
                response = urllib2.urlopen(request,None,5)
                html=response.read().decode('utf-8')
                json_str = json.loads(html)
                ms=json_str['msg']
                if len(ms) <5: # only a few results left: this category is exhausted
                    break
                msg=ms[0]
            except Exception as e:
                print "eeeee",e
            #print msg
            if msg is not None:
                soup = BeautifulSoup(str(msg))
                blks = soup.find_all('div', {'class' : 'blk'})
                for blk in blks:
                    page=blk.find('a').get('href')
                    if page is not None:
                        node=page.replace("/topic/","") # put more seeds into the database
                        parent=name
                        ne=blk.find('strong').text
                        try:
                            queue.put((node,ne,parent)) #First put into the queue
                            while queue.qsize() >0:
                                n,name,p=queue.get() #Top node dequeue
                                size=queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n,name,p,top_id)
                                getChildren(n,name) #Children of dequeuing content
                            conn.commit()
                        except Exception as e:
                            print "what's wrong",e  
        except urllib2.URLError, e:
            print "error is",e
            
        
if __name__ == '__main__':
    for i in range(400): # re-run the whole sweep; repeated passes retry rows that failed earlier
        new_work()

A word on the database: I'm not attaching a dump here; create the tables and fields yourself, since they really are simple. I use MySQL; build the schema to fit your own needs.
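For reference, here is a sketch of a schema that satisfies the queries in the listing. The table and column names come straight from the SQL above; the types and sizes are my own guesses, so adjust them to your data.

#coding:utf-8
import MySQLdb as mdb

# Table/column names are taken from the crawler's queries; the types are assumptions.
SCHEMA = [
    """CREATE TABLE IF NOT EXISTS classify_new (
        id INT AUTO_INCREMENT PRIMARY KEY,
        data_id VARCHAR(32),
        name VARCHAR(255)
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS classify_new_copy (
        id INT AUTO_INCREMENT PRIMARY KEY,
        data_id VARCHAR(32),
        name VARCHAR(255),
        status TINYINT NOT NULL DEFAULT 1
    ) DEFAULT CHARSET=utf8""",
    """CREATE TABLE IF NOT EXISTS rooms (
        id INT AUTO_INCREMENT PRIMARY KEY,
        father_id INT,
        name VARCHAR(255),
        friends_num INT,
        description TEXT,
        create_time DATETIME,
        creater_id INT,
        room_avatar VARCHAR(255),
        is_pass TINYINT,
        has_index TINYINT,
        reason_id INT
    ) DEFAULT CHARSET=utf8""",
]

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
curr = conn.cursor()
for ddl in SCHEMA:
    curr.execute(ddl)
conn.commit()

To bootstrap, run getTopics() once to fill classify_new, then copy the categories you want crawled into classify_new_copy with status=1; new_work() reads its seeds from there. (The older work() entry point additionally expects a classify table with id, node, parent, name, and status columns.)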

If anything is unclear, come and find me at carousel.com, since I developed that site as well.
