爬取携程攻略社区所有笔记并保存到txt文件

这是一个朋友做的旅游方面的分析,需要游客的所有笔记的文本,并且保存在一个txt里
源码如下:

import requests
from lxml import etree
from bs4 import BeautifulSoup

# Browser-like UA so Ctrip does not serve a bot/blocked page.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
}

# Bug fix: was 'G:write_data.txt' (a drive-relative path on Windows), which
# does NOT match the r'G:\write_data.txt' the word-cloud script reads back.
OUTPUT_FILE = r'G:\write_data.txt'

# Walk pages 1..80 of the search results for the query
# "避暑旅游" (URL-encoded in the query string below).
for num in range(1, 81):
    print(num)
    url = ('https://you.ctrip.com/searchsite/travels/'
           '?query=%e9%81%bf%e6%9a%91%e6%97%85%e6%b8%b8'
           '&isAnswered=&isRecommended=&publishDate=365&PageNo=' + str(num))
    # Bug fix: the listing request previously omitted `headers`,
    # so only the detail-page requests carried the browser UA.
    list_html = requests.get(url, headers=headers, timeout=10).text
    tree = etree.HTML(list_html)
    items = tree.xpath('/html/body/div[2]/div[2]/div[2]/div/div[1]/ul/li')
    for item in items:
        hrefs = item.xpath('./dl/dt/a/@href')
        if not hrefs:
            # Some <li> entries carry no note link (ads, separators); skip.
            continue
        note_url = 'https://you.ctrip.com' + hrefs[0]
        print(note_url)
        note_html = requests.get(url=note_url, headers=headers, timeout=10).text
        soup = BeautifulSoup(note_html, "html.parser")
        content = soup.find(attrs={"class": "ctd_content"})
        if content is None:
            # Bug fix: soup.find() returns None when the note body is
            # missing; calling get_text() on it crashed the whole crawl.
            continue
        note_text = content.get_text().replace("\n", "")
        # 'a' appends so earlier pages are kept; the file is created if
        # missing.  (The redundant f.close() inside `with` was removed —
        # the context manager already closes the file.)
        with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
            f.write(note_text)
print("获取完毕!")

爬取结果:
（此处为爬取结果截图，原文图片未能随文本导出）
词云分析:

# coding: utf-8

from wordcloud import WordCloud
import cv2
import jieba

# Read the notes text produced by the crawler script.
with open(r'G:\write_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# jieba segments the Chinese text; WordCloud expects space-separated tokens.
cut_text = " ".join(jieba.cut(text))

# Mask image that gives the word cloud its shape.
# NOTE(review): 'G:1234.jpg' is drive-relative (resolved against the current
# directory on G:), unlike the absolute txt path above — confirm intended.
color_mask = cv2.imread('G:1234.jpg')
if color_mask is None:
    # Bug fix: cv2.imread fails SILENTLY by returning None; without this
    # check WordCloud would fail later with a confusing error.
    raise FileNotFoundError("mask image 'G:1234.jpg' not found or unreadable")

cloud = WordCloud(
    # Bug fix: the path began with a space (" C:\\Windows\\..."), so the
    # font could not be loaded and Chinese glyphs rendered as boxes.
    # A CJK-capable font is required to avoid mojibake.
    font_path="C:\\Windows\\Fonts\\STXINGKA.TTF",
    # Background color of the canvas.
    background_color='white',
    # Shape mask: words are only drawn where the mask is non-white.
    mask=color_mask,
    # Cap on the number of words rendered.
    max_words=2000,
    # Largest font size used for the most frequent word.
    max_font_size=40,
)

wCloud = cloud.generate(cut_text)
wCloud.to_file('cloud.jpg')

import matplotlib.pyplot as plt
plt.imshow(wCloud, interpolation='bilinear')
plt.axis('off')
plt.show()

词云结果:
（此处为词云结果截图，原文图片未能随文本导出）

发布了62 篇原创文章 · 获赞 25 · 访问量 9314

猜你喜欢

转载自blog.csdn.net/ayouleyang/article/details/102539308