利用靓汤(BeautifulSoup)爬取易烊千玺在QQ音乐扑通社区的“阅读”、“动态”和“成员”数据。爬虫效果如下:
爬取易烊千玺QQ音乐数据
一、搭建框架
框架设计如下:
def main():
    """Scrape the fan-community page and save the numbers to an .xls file."""
    base_url = "https://i.y.qq.com/n2/m/share/fans_community/putoo_group.html?mid=042aMUl42A0lb8&ADTAG=wxfshare"
    # Fetch the page and pull out the statistics.
    community_stats = getData(base_url)
    # Persist the results to a spreadsheet.
    saveData(community_stats, "QQ扑通.xls")
二、子程序和主函数
1、爬取网页
def getData(baseurl):
    """Fetch *baseurl* and extract the community header statistics.

    Returns a list of strings: the text of every
    ``<p class="header_data__number">`` element on the page, in page order
    (reads, posts, members).
    """
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")  # parse with the stdlib HTML parser
    # Each statistic lives in a <p class="header_data__number"> element;
    # collect the text of every match.  (Renamed from `list`, which
    # shadowed the builtin.)
    results = [str(item.text) for item in soup.find_all("p", class_="header_data__number")]
    print(results)  # debug aid: show what was scraped
    return results
2、得到url信息
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    A desktop-browser User-Agent header is sent so the server treats the
    request as coming from a normal client.  On ``URLError`` the HTTP
    status code and/or reason are printed and ``""`` is returned.
    """
    head = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager guarantees the connection is closed even on a
        # decode error (the original never closed the response).
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
3、保存
def saveData(datalist, savepath):
    """Write the scraped statistics to an .xls workbook at *savepath*.

    Row 0 holds the column titles; row 1 holds the corresponding values
    from *datalist* (expected order: reads, posts, members).
    """
    print("开始保存")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # workbook object
    # cell_overwrite_ok=True lets a cell be re-written without raising.
    sheet = book.add_sheet('QQ扑通', cell_overwrite_ok=True)
    col = ("阅读", "动态", "成员")
    # Header row.
    for j, title in enumerate(col):
        sheet.write(0, j, title)
    # Data row.  Truncate to the number of titles so a short scrape
    # result no longer raises IndexError (original indexed datalist[j]).
    for j, value in enumerate(datalist[:len(col)]):
        sheet.write(1, j, value)
    book.save(savepath)
4、运行主函数
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()