# -*- coding: utf-8 -*-
# 获取相关搜索内容
from threading import Thread
from queue import Queue
import requests
from lxml import etree
import cssselect
class Baiduxg(Thread):
    """Worker thread that downloads Baidu result pages and prints related searches.

    The worker repeatedly takes a URL from a shared queue, fetches the page
    and prints the related-search terms extracted from it.
    """

    def __init__(self, link, queue):
        """Create a worker.

        Args:
            link: URL associated with this worker. It is stored on the
                instance; the fetch loop itself reads its URLs from ``queue``.
            queue: Queue-like object holding URLs. It must provide ``get()``
                and ``task_done()``.
        """
        super().__init__()
        self.queue = queue
        self.link = link
        # Headers sent with every request. The User-Agent string identifies
        # the client as Baiduspider.
        self.headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            "User-Agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
        }

    def run(self):
        """Process URLs from the queue forever, one page per iteration.

        The loop has no exit condition, so the thread is expected to be run
        as a daemon and the producer to wait with ``queue.join()``.
        ``task_done()`` is called exactly once for every item taken, whether
        or not the page could be fetched and parsed.
        """
        while True:
            # Take the item before entering the try block so task_done() is
            # only ever called for an item that was actually obtained.
            link = self.queue.get()
            try:
                html = self.parse_html(link)
                # parse_html() returns None when the request failed; there is
                # nothing to parse in that case.
                if html is not None:
                    self.get_text(html)
            except Exception as err:
                # Thread boundary: a single bad page must not kill the
                # worker, otherwise the remaining queue items would never be
                # processed and queue.join() would block forever.
                print("Failed to process {}: {!r}".format(link, err))
            finally:
                self.queue.task_done()

    @staticmethod
    def get_text(html):
        """Print and return the related-search terms found in a result page.

        Args:
            html: Page markup. ``None`` or an empty value yields an empty
                list without printing anything.

        Returns:
            The list of text nodes matched by the related-search XPath
            (empty when nothing matches).
        """
        if not html:
            return []
        doc = etree.HTML(html)  # build the document tree
        if doc is None:
            # NOTE(review): lxml may hand back None for input it cannot
            # parse, depending on version - confirm; guard so .xpath() is
            # never called on None.
            return []
        # Anchor texts inside the "rs" block. The original author noted the
        # corresponding CSS selector as: #rs > div > table > tr > th > a
        relate_search = doc.xpath('//div[@id="rs"]/div/table/tr/th/a/text()')
        print(relate_search)
        return relate_search

    def parse_html(self, links):
        """Download one page and return its body decoded as UTF-8.

        Args:
            links: URL of the page to fetch.

        Returns:
            The response text, or ``None`` if the request raised
            ``requests.RequestException`` (the error is printed).
        """
        try:
            r = requests.get(links, headers=self.headers, timeout=3)
        except requests.RequestException as err:
            print(err)
            return None
        # Force UTF-8 instead of relying on the encoding requests guesses.
        r.encoding = 'utf-8'
        return r.text
if __name__ == '__main__':
    # Script entry point: fetch a single Baidu result page for the keyword
    # "seo" and print its related searches.
    url = "http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=seo&fenlei=256&oq=seo&rsv_pq=f910f0f3000fe857&rsv_t=acc3RUQCtu%2FSTxgtcqyM53NKwxbU6Yjah7Ar4s4JFsZQDXG5Tg0PCrpAozI&rqlang=cn&rsv_dl=tb&rsv_enter=0&rsv_btype=t"

    # Queue of URLs shared with the worker thread.
    query = Queue()
    query.put(url)

    # Daemon thread: the worker loops forever, so it must not keep the
    # interpreter alive once the queue has been drained.
    xg = Baiduxg(url, query)
    xg.daemon = True
    xg.start()

    # Block until the worker has called task_done() for every queued URL.
    query.join()
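# 抓取百度相关词 - 简化版 (Scraping Baidu related-search terms - simplified version)
# 猜你喜欢 (You may also like)
# 转载自 blog.csdn.net/haohaomax1/article/details/111312644 (reposted from this address)
# 今日推荐 (Today's recommendations)
# 周排行 (Weekly ranking)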