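# -*- coding: utf-8 -*-
# Baidu Tieba crawler
#
# Target site: http://tieba.baidu.com/p/3522395718
# Scraped fields: reply author's user name, reply content, reply time
# Techniques covered: Requests to fetch pages, XPath to extract content, map() over a thread pool for a multi-threaded crawler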

from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json

import sys
# Python 2-only workaround: reload(sys) makes sys.setdefaultencoding available
# again so the interpreter's implicit byte-string <-> unicode conversions use
# UTF-8 instead of ASCII. Neither the reload() builtin nor
# sys.setdefaultencoding exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')

def spider(url):
    """Fetch one thread page and hand every reply found on it to ``towrite``.

    For each reply container on the page, the author name and reply time are
    read from the JSON stored in the element's ``data-field`` attribute, and
    the reply text is taken from the first text node of the content <div>.

    Args:
        url: URL of a single page of the thread.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the server does not answer within the timeout).
        KeyError / ValueError: if a ``data-field`` attribute does not contain
            the expected JSON structure.
    """
    # A timeout keeps a stalled connection from blocking a pool worker forever.
    html = requests.get(url, timeout=10)
    selector = etree.HTML(html.text)
    # The class attribute is matched verbatim, including its trailing spaces.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "] ')
    for each in content_field:
        data_field = each.xpath('@data-field')
        if not data_field:
            # No metadata on this container: nothing to extract, skip it.
            continue
        # The attribute holds JSON; json.loads() turns that string into a dict.
        # Any literal '&quot' left in the text is stripped before parsing.
        reply_info = json.loads(data_field[0].replace('&quot', ''))
        author = reply_info['author']['user_name']
        reply_time = reply_info['content']['date']

        text_nodes = each.xpath(
            'div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content  clearfix"]/text()')
        # A reply without a direct text node yields an empty list; record it
        # with empty content instead of failing with IndexError.
        content = text_nodes[0] if text_nodes else u''

        print(content)
        print(reply_time)
        print(author)

        # Build a fresh dict per reply rather than mutating a shared one.
        item = {
            'user_name': author,
            'topic_reply_content': content,
            'topic_reply_time': reply_time,
        }
        towrite(item)

def towrite(contentdict, out=None):
    """Write one reply record as three labelled lines.

    Args:
        contentdict: dict providing the keys ``'topic_reply_time'``,
            ``'topic_reply_content'`` and ``'user_name'``.
        out: optional writable text stream. When omitted, the module-level
            file object ``f`` opened by the script entry point is used.

    Raises:
        KeyError: if one of the three expected keys is missing.
    """
    target = f if out is None else out
    # Format the whole record first and emit it with a single write() so the
    # three lines of one reply stay together when several threads write to
    # the same file. Formatting into a unicode template also avoids the
    # str()/unicode() round-trips on non-ASCII values.
    record = u'回帖时间：{0}\n回帖内容：{1}\n回帖人：{2}\n'.format(
        contentdict['topic_reply_time'],
        contentdict['topic_reply_content'],
        contentdict['user_name'],
    )
    target.write(record)

if __name__ == '__main__':
    # Script entry point: crawl pages 1-20 of the target thread with a pool
    # of 8 worker threads and append every reply to content.txt.
    page = ['http://tieba.baidu.com/p/3522395718?pn=' + str(i) for i in range(1, 21)]
    print(page)

    pool = ThreadPool(8)
    # ``f`` is intentionally bound at module level: towrite() writes to it.
    # The ``with`` block guarantees the file is closed even if a worker fails.
    with open('content.txt', 'a') as f:
        try:
            # map() blocks until every page has been processed and re-raises
            # the first exception raised by a worker, if any.
            pool.map(spider, page)
        finally:
            # Always shut the pool down and wait for its threads to finish
            # before the output file is closed.
            pool.close()
            pool.join()
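# "You may also like" -- presumably leftover text from the web page this script was copied from; not part of the program.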