python_celery: reading from Redis

1. execute_tasks.py # builds the city task list and pushes the tasks into Redis
import requests
import re
from lxml import etree
from aqicn import crawl

def get_cities():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'}
    response = requests.get('http://aqicn.org/city/all/cn/', headers=headers)
    html = response.content.decode('utf-8')

    # Slice the page into regional blocks using the country anchor tags
    # (e.g. the block between the 香港 and 蒙古 anchors is Hong Kong)
    hongkong = re.compile(r'id=\'香港\'></a>(.*)<a id=\'蒙古\'></a>', re.S)
    taiwan = re.compile(r'id=\'台湾\'></a>(.*)<a id=\'新加坡\'></a>', re.S)
    land = re.compile(r'id=\'中国\'></a>(.*)<a id=\'越南\'></a>', re.S)

    hongkong = etree.HTML(hongkong.findall(html)[0])
    taiwan = etree.HTML(taiwan.findall(html)[0])
    land = etree.HTML(land.findall(html)[0])

    city_list = land.xpath('//a') + hongkong.xpath('//a') + taiwan.xpath('//a')
    # Each <a> carries a city page URL and the city name
    for i in city_list:
        try:
            url = i.xpath('.//@href')[0]
            name = i.xpath('.//text()')[0].strip()
            if name:
                yield {'url': url, 'city': name}
        except Exception as e:
            print(e)

def task_manager():
    # Enqueue one crawl task per city; Celery serializes the dict into the broker
    for data in get_cities():
        crawl.delay(data)

if __name__ == '__main__':
    task_manager()
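
Before these tasks can be consumed, a Celery worker for the aqicn module must be running (celery -A aqicn worker --loglevel=info). As a quick sanity check that task_manager() actually filled the queue, you can inspect the broker database directly. Below is a minimal sketch, assuming Celery's default queue (a Redis list named 'celery') and the broker settings from aqicn.py in the next listing; check_queue.py is a hypothetical helper, not part of the original scripts:

import redis

# db 1 is the broker database configured in aqicn.py
broker = redis.StrictRedis(host='192.168.4.53', port=6379, db=1)
print('tasks waiting:', broker.llen('celery'))  # length of the default Celery queue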

2. aqicn.py # the worker: consumes tasks from the queue, parses each city page, and stores the result

import requests
from lxml import etree
from celery import Celery

# The broker and the result backend are defined here.
# Note that the IP and the trailing database numbers are adjustable.
app = Celery('aqicn', broker='redis://192.168.4.53:6379/1', backend='redis://192.168.4.53:6379/3')

# The decorator marks this function as a Celery task
@app.task
def crawl(data):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Host': 'aqicn.org',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://aqicn.org/city/all/cn/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    url = data['url']
    response = requests.get(url, headers=headers)
    html = response.content.decode('utf-8')
    html = etree.HTML(html)
    try:
        aqi = html.xpath('//div[@id="aqiwgtvalue"]//text()')[0]  # air quality reading
    except IndexError:
        # No AQI widget on the page; fall back to the notice text, if any
        news = html.xpath('//div[@class="section-content"]/center//h3//text()')
        if news:
            aqi = ''.join(news)
        else:
            aqi = 'page failed to load'
    data['aqi'] = aqi
    return data
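
For a one-off test without execute_tasks.py, a task can be enqueued by hand and its result fetched synchronously. A minimal sketch (the Beijing city dict is a hypothetical example):

from aqicn import crawl

result = crawl.delay({'url': 'http://aqicn.org/city/beijing/', 'city': '北京'})
print(result.get(timeout=60))  # blocks until the worker writes the result into db 3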

3. redis_read.py # reads values from the Redis result backend, prints them, and deletes the originals; whenever new values arrive it keeps reading and deleting, so results can be shown directly or persisted to MySQL

import pickle
import redis

rediscli = redis.StrictRedis(host='192.168.4.53', db=3, port=6379)  # the result backend (db 3)

while True:
    result = rediscli.keys()  # keep polling Redis for result keys
    for i in result:
        key = i.decode('utf-8')
        data = rediscli.get(key)
        try:
            # assumes Celery is configured to pickle-serialize results
            print(pickle.loads(data))
            rediscli.delete(key)  # consume the result: delete the key once read
        except Exception as e:
            print(e)
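
Note that pickle.loads only works if Celery's result serializer is pickle (the pre-4.0 default); since Celery 4, results are JSON-encoded by default and stored under keys prefixed with celery-task-meta-. A minimal sketch for that case, using the same host and database as above:

import json

import redis

rediscli = redis.StrictRedis(host='192.168.4.53', port=6379, db=3)
for key in rediscli.keys('celery-task-meta-*'):  # Celery's result-key prefix
    envelope = json.loads(rediscli.get(key))     # {'status': ..., 'result': ..., ...}
    print(envelope['status'], envelope['result'])
    rediscli.delete(key)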


Reposted from www.cnblogs.com/hejianlong/p/10056733.html