爬虫新一例,爬取新闻

新闻网站靠广告维持运转，本身也很不容易。
请大家尽量通过正常方式浏览新闻。

此段代码仅供学习之用，不得用于商业用途。
转载请注明出处。

代码:
Python环境 : Python 3.6

import requests
from lxml import etree

# Shared accumulator of article hrefs: get_urls() appends to it, and the
# script entry point at the bottom of the file passes it to get_articles().
urls = []


def get_urls(page_num):
    """Collect article links from the first ``page_num`` listing pages.

    For each n in 1..page_num the page
    ``https://m.cnbeta.com/wap/index.htm?page=<n>`` is fetched and every
    ``href`` attribute found on or below the ``<a>`` elements inside
    ``div#info_list > div.list`` is appended to the module-level ``urls``
    list. The accumulated list is printed once after all pages are done.

    Args:
        page_num: Number of listing pages to fetch. Values below 1 fetch
            nothing.

    Raises:
        requests.exceptions.RequestException: If a request fails at the
            network level or exceeds the timeout.
    """
    headers = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    for num in range(1, page_num + 1):
        url = 'https://m.cnbeta.com/wap/index.htm?page=' + str(num)
        # requests has no default timeout; without one a stalled connection
        # would block the crawl forever.
        data_list = requests.get(url, headers=headers, timeout=10)
        if not data_list.ok:
            # Do not parse an HTTP error page as if it were a listing.
            print('skipping', url, '- HTTP status', data_list.status_code)
            continue
        data_list.encoding = 'utf-8'
        data_html = etree.HTML(data_list.text)
        data_urls = data_html.xpath('//div[@id="info_list"]/div[@class="list"]/a//@href')
        # Mutate the shared module-level list in place (no rebinding, so no
        # ``global`` statement is required).
        urls.extend(data_urls)
    print(urls)


def write_data(title,content):
    """Append one article to ``cnbeta.txt`` (UTF-8).

    The entry consists of a ``>>>  title  <<<`` heading, a blank line, the
    article text, and a two-line dashed separator followed by blank lines.

    Args:
        title: Article title, written inside the heading markers.
        content: Article body text.
    """
    with open('cnbeta.txt', mode='a', encoding='utf-8') as out_file:
        heading = '>>>  ' + title + '  <<<\n\n'
        out_file.write(heading)
        footer = '\n\n--------------------------\n--------------------------\n\n\n'
        out_file.write(content + footer)


def get_articles(urls):
    """Download each article and append its title and text to the output file.

    Every entry of ``urls`` is resolved against ``https://m.cnbeta.com`` and
    fetched. The title is the first text node under ``div.title > b``; the
    body is every text node under ``div.content > p``, each prefixed with a
    newline. The pair is handed to ``write_data``. Articles that return an
    HTTP error status or that have no matching title element are reported
    and skipped rather than aborting the whole run.

    Args:
        urls: Iterable of article hrefs (as collected from the listing
            pages); may be empty.

    Raises:
        requests.exceptions.RequestException: If a request fails at the
            network level or exceeds the timeout.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    headers = {
        'Referer': 'https://m.cnbeta.com/wap/index.htm',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    for url in urls:
        # urljoin gives the same result as plain concatenation for
        # "/path" hrefs, and also copes with absolute or slash-less hrefs.
        new_url = urljoin('https://m.cnbeta.com', url)
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the crawl.
        response = requests.get(new_url, headers=headers, timeout=10)
        if not response.ok:
            print('skipping', new_url, '- HTTP status', response.status_code)
            continue
        response.encoding = "utf-8"
        response_html = etree.HTML(response.text)
        title = response_html.xpath('//div[@class="title"]/b//text()')
        print(title)
        if not title:
            # Page layout differs or the article is gone: title[0] would
            # raise IndexError and abort every remaining article.
            print('skipping', new_url, '- no title found')
            continue
        content = response_html.xpath('//div[@class="content"]/p//text()')
        # Each text node is preceded by a newline, as in the original output.
        content_all = ''.join('\n' + content_x for content_x in content)
        write_data(title[0], content_all)


if __name__ == '__main__':
    # Prompt only when executed as a script: with the input() call at module
    # level, merely importing this file would block waiting for user input.
    page_num = int(input("请输入您想得到几页的数据: "))
    get_urls(page_num)
    get_articles(urls)

猜你喜欢

转载自blog.csdn.net/abcdasdff/article/details/82114887
今日推荐