- 这个网站没有反爬措施,请求使用的是 Python 自带的 urllib,解析使用 lxml 的 xpath,通过生成器逐条产出数据,并保存到 MongoDB。
- 代码
from urllib import request
from pymongo import MongoClient
from lxml import etree
# Module-level MongoDB client pointed at localhost:27017.
# NOTE: the spelling `clinet` (sic) is the name spider_joke.save_data looks up,
# so it must not be renamed here without updating that method as well.
clinet = MongoClient(host='localhost', port=27017)
class Requesterror(Exception):
    """Custom exception signalling that a page request failed."""
class spider_joke(object):
    """Fetch one listing page, extract every post on it and store the posts.

    The pipeline is ``get_request`` -> ``get_parse`` -> ``save_data`` and is
    driven by ``main``.  Every stage is best-effort: failures are printed
    rather than propagated, so a single bad page or record does not abort
    the run.
    """

    # (item key, XPath relative to one post <li>) pairs, in output order.
    # NOTE(review): the keys "comment-counts" and "cotent" look like typos,
    # but they are the field names written to the database, so they are kept
    # unchanged to stay compatible with records that were already stored.
    _FIELD_XPATHS = (
        ("user_name", ".//div[@class='u-txt']/a/text()"),
        ("pubDate", ".//div[@class='u-txt']/span/text()"),
        ("pic_url", ".//div[@class='j-r-list-c-img']//img/@src"),
        ("praise_count", ".//li[@class='j-r-list-tool-l-up']/span/text()"),
        ("comment-counts", ".//li[contains(@class,'f-tac')]//a/span/text()"),
        ("cotent", ".//div[@class='j-r-list-c-desc']/a/text()"),
    )

    def __init__(self):
        self.url = "http://www.budejie.com/2"
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}

    def get_request(self, url, headers, timeout=10):
        """Download ``url`` and return its body as text.

        Args:
            url: Address to fetch.
            headers: Mapping of HTTP request headers to send.
            timeout: Seconds to wait on the connection before giving up.

        Returns:
            The decoded response body, or ``None`` when the request or the
            decoding failed (the error is printed).
        """
        try:
            # The context manager closes the connection even if read() fails.
            with request.urlopen(request.Request(url, headers=headers),
                                 timeout=timeout) as response:
                raw = response.read()
                # Honour the charset declared by the server; fall back to
                # UTF-8 when none is declared.
                charset = response.headers.get_content_charset() or "utf-8"
            return raw.decode(charset)
        except Exception as e:
            # Deliberately broad: fetching is best-effort and callers are
            # told about the failure through the ``None`` return value.
            print(e)
            return None

    @staticmethod
    def _first(node, path):
        """Return the first result of ``node.xpath(path)``, or ``None`` if empty."""
        matches = node.xpath(path)
        return matches[0] if matches else None

    def get_parse(self, text):
        """Yield one dict per post found in the HTML document ``text``.

        Each dict has the keys listed in ``_FIELD_XPATHS``; a field whose
        XPath matches nothing is set to ``None``.  Nothing is yielded when
        ``text`` is empty or ``None``.
        """
        if not text:
            return
        htm = etree.HTML(text)
        if htm is None:
            # Guard against the parser handing back no document at all.
            return
        for li in htm.xpath("//div[@class='j-r-list']/ul/li"):
            # Each XPath is evaluated once per post.
            yield {key: self._first(li, path)
                   for key, path in self._FIELD_XPATHS}

    def save_data(self, data):
        """Insert ``data`` into the ``joke`` collection of the ``test`` database.

        Errors are printed instead of raised so that one failed insert does
        not stop the remaining items from being saved.
        """
        try:
            col = clinet["test"]["joke"]
            col.insert_one(data)
        except Exception as e:
            print(e)

    def main(self):
        """Run the whole pipeline: fetch the page, parse it, save every item."""
        try:
            text = self.get_request(self.url, self.headers)
            if text is None:
                # get_request has already reported why the download failed.
                return
            for item in self.get_parse(text):
                self.save_data(item)
        except Exception as e:
            print(e)
if __name__ == "__main__":
    # Script entry point: build the spider and run a single crawl.
    crawler = spider_joke()
    crawler.main()