抓取百度相关词 - 简化版

# -*- coding: utf-8 -*-
# 获取相关搜索内容
from threading import Thread
from queue import Queue
import requests
from lxml import etree
import cssselect

class Baiduxg(Thread):
    """Worker thread that downloads Baidu result pages and prints related-search terms.

    The worker repeatedly takes a URL from a shared queue, fetches it with
    ``requests`` and prints the link texts found in the page's ``div#rs``
    block.
    """

    def __init__(self, link, queue):
        """Create a worker.

        Args:
            link: URL stored on the instance as ``self.link``. ``run`` does
                not read it; URLs to process are taken from ``queue``.
            queue: Queue-like object providing ``get()`` and ``task_done()``.
        """
        super().__init__()
        self.queue = queue
        self.link = link
        # Request headers sent with every download; the User-Agent string
        # identifies the client as Baiduspider.
        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            "User-Agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
        }

    def run(self):
        """Process URLs from the queue forever.

        Every item taken from the queue is acknowledged with ``task_done()``
        exactly once, whether or not processing succeeded, so a producer
        waiting on ``queue.join()`` can make progress.
        """
        while True:
            # Take the item outside the try block so task_done() is only
            # ever called for an item that was actually retrieved.
            link = self.queue.get()
            try:
                html = self.parse_html(link)
                self.get_text(html)
            except Exception as err:
                # Worker-loop boundary: one bad page must not kill the
                # thread, otherwise the remaining queue items would never be
                # acknowledged and queue.join() would block forever.
                print(err)
            finally:
                self.queue.task_done()

    @staticmethod
    def get_text(html):
        """Print and return the related-search terms found in *html*.

        Args:
            html: Page source as text, or ``None`` when the download failed.

        Returns:
            List of link texts located under the ``div`` with id ``rs``.
            An empty list is returned (and nothing is printed) when there is
            no document to parse.
        """
        # parse_html() returns None on a failed request; there is nothing to
        # parse in that case, and the same goes for an empty body.
        if not html:
            return []
        doc = etree.HTML(html)  # build the document tree
        # Defensive: guard against the parser yielding no root element.
        if doc is None:
            return []
        relate_search = doc.xpath('//div[@id="rs"]/div/table/tr/th/a/text()')
        print(relate_search)
        return relate_search

    def parse_html(self, links):
        """Download *links* and return the response body as text.

        Args:
            links: URL to request.

        Returns:
            The response text decoded as UTF-8, or ``None`` when ``requests``
            raised a ``RequestException`` (the error is printed).
        """
        try:
            r = requests.get(links, headers=self.headers, timeout=3)
        except requests.RequestException as err:
            html = None
            print(err)
        else:
            # Force UTF-8 instead of relying on requests' encoding detection.
            r.encoding = 'utf-8'
            html = r.text
        return html

if __name__ == '__main__':
    # Script entry point: queue one Baidu search URL, let a single daemon
    # worker process it, and wait until the queue reports it as done.
    search_url = "http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=seo&fenlei=256&oq=seo&rsv_pq=f910f0f3000fe857&rsv_t=acc3RUQCtu%2FSTxgtcqyM53NKwxbU6Yjah7Ar4s4JFsZQDXG5Tg0PCrpAozI&rqlang=cn&rsv_dl=tb&rsv_enter=0&rsv_btype=t"

    task_queue = Queue()
    task_queue.put(search_url)

    worker = Baiduxg(search_url, task_queue)
    # Daemon thread: the worker loops forever, so it must not keep the
    # interpreter alive once the main thread finishes.
    worker.daemon = True
    worker.start()

    # Block until every queued URL has been acknowledged via task_done().
    task_queue.join()

猜你喜欢

转载自blog.csdn.net/haohaomax1/article/details/111312644