Python crawler - scraping inspirational quotes

Write the fetched page to a file
import urllib.request

url = 'http://www.baidu.com'

response = urllib.request.urlopen(url=url)

print(response)

#print(response.read().decode())  # note: the body can only be read once, so this stays commented out

with open('baidu.html', 'w', encoding='utf8') as fp:
    fp.write(response.read().decode())
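
decode() here assumes UTF-8. A minimal variant (not part of the original code) that takes the charset from the response headers and catches network errors could look like this:

import urllib.request
import urllib.error

url = 'http://www.baidu.com'
try:
    response = urllib.request.urlopen(url=url, timeout=10)
except urllib.error.URLError as e:
    print('request failed:', e)
else:
    # prefer the charset declared in the response headers, fall back to utf8
    charset = response.headers.get_content_charset() or 'utf8'
    with open('baidu.html', 'w', encoding='utf8') as fp:
        fp.write(response.read().decode(charset))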

Scrape the inspirational quotes
import urllib.request
import urllib.parse
import re
import os
import time
# First draft: everything written procedurally (kept here, commented out)
'''
start_page = int(input("Enter the start page: "))
end_page = int(input("Enter the end page: "))

for page in range(start_page, end_page + 1):
    print("Start downloading page %s......" % page)
    url = 'http://www.yikexun.cn/lizhi/qianming/list_50_' + str(page) + '.html'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64;'
                            ' x64) AppleWebKit/537.36 (KHTML, like'
                            ' Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    request = urllib.request.Request(url=url, headers=header)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf8')
    # save the raw page to F1.html for inspection (overwritten on every page)
    with open('F1.html', 'w', encoding='utf8') as fp:
        fp.write(content)
    print("Finished downloading page %s" % page)

    pattern = re.compile(r'<b>(.*?)</b></a></h3>.*?<p>(.*?)</p>',re.S)
    ret = pattern.findall(content)
    #print(ret)
    for text_info in ret:
        text_title = text_info[0]
        text_main = text_info[1]
        dirName = 'lizhi'
        if not os.path.exists(dirName):
            os.mkdir(dirName)
        fileName = str(text_title).split('——')[0]
        filePath = os.path.join(dirName, fileName)
        with open(filePath + '.txt', 'w', encoding='utf8') as fp:
            fp.write(text_main)

'''
# Wrap the steps above into functions
def handle_request(url, page):
    url += str(page) + '.html'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64;'
                            ' x64) AppleWebKit/537.36 (KHTML, like'
                            ' Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    request = urllib.request.Request(url=url, headers=header)
    return request
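
# Illustrative example (page number assumed): with the url defined in main(),
# handle_request(url, 2) returns a Request for
# http://www.yikexun.cn/lizhi/qianming/list_50_2.html with the browser User-Agent attached.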

def parse_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf8')
    # save the raw page to F1.html for inspection (overwritten on every page)
    with open('F1.html', 'w', encoding='utf8') as fp:
        fp.write(content)
    pattern = re.compile(r'<b>(.*?)</b></a></h3>.*?<p>(.*?)</p>', re.S)
    ret = pattern.findall(content)
    writeIn(ret)
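
# The pattern above grabs the title inside <b>...</b></a></h3> and the body in the
# following <p>...</p>; re.S lets '.' match across newlines. A quick check against a
# made-up snippet (illustrative only, not taken from the site):
#   sample = '<h3><a href="#"><b>Title</b></a></h3>\n<div><p>Body text</p></div>'
#   re.findall(r'<b>(.*?)</b></a></h3>.*?<p>(.*?)</p>', sample, re.S)
#   # -> [('Title', 'Body text')]
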
def writeIn(ret):
    for text_info in ret:
        text_title = text_info[0]
        text_main = text_info[1]
        dirName = 'lizhi'
        if not os.path.exists(dirName):
            os.mkdir(dirName)
        fileName = str(text_title)
        filePath = os.path.join(dirName, fileName)
        print("%s start writing......"%fileName)
        with open(filePath+ '.txt', 'w',encoding = 'utf8') as fp:
            fp.write(text_main)
        print("%s write successfully" % fileName)
        #time.sleep()
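
# Optional helper (not in the original script): the <p>...</p> body captured by the
# regex can still contain markup such as <br/>; calling fp.write(clean_text(text_main))
# instead would keep the .txt files free of leftover tags.
def clean_text(html_fragment):
    # remove anything that looks like an HTML tag, then trim whitespace
    return re.sub(r'<[^>]+>', '', html_fragment).strip()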

def main():
    url = 'http://www.yikexun.cn/lizhi/qianming/list_50_'
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))

    for page in range(start_page, end_page + 1):
        print("start downloading %s页......" % page)
        request = handle_request(url, page)
        parse_content(request)
        print("第%s页 end download" % page)
        time.sleep(1)

if __name__ == '__main__':
    main()
