xpath和beautifulsoup爬取网页的demo

本文同时用到了 BeautifulSoup 和 XPath 两种解析方式,也给出了保存为 CSV 和保存为 Excel 两种写法,其中写入 Excel 使用的是 openpyxl。

首先说明一下,我们这次要爬取的网页是 http://www.allitebooks.org/

import requests
import csv
from wutils import defNum
from bs4 import BeautifulSoup
from lxml import etree
import openpyxl
def resolver_page(data):
    """Parse one listing page into ``[title, link, authors, summary]`` rows.

    The page is parsed twice on purpose, because this demo shows both APIs:
    BeautifulSoup (CSS selectors) extracts title and link, lxml (XPath)
    extracts authors and summary.

    Args:
        data: HTML source of a listing page.

    Returns:
        A list with one 4-item list per title link found:
        ``[title text, href or None, list of author names, summary text]``.
        Every row always has four items; when the page has fewer author or
        summary blocks than titles, the missing values are ``[]`` and ``''``.
    """
    soup = BeautifulSoup(data, 'lxml')
    tree = etree.HTML(data)

    # Title text and target URL of every book link on the page.
    books = [
        [link.text, link.attrs.get('href')]
        for link in soup.select('h2[class="entry-title"] a[rel="bookmark"]')
    ]

    # NOTE(review): authors/summaries are matched to titles by position,
    # i.e. this assumes the n-th block belongs to the n-th title - confirm
    # against the page markup.
    author_nodes = tree.xpath('//h5[@class="entry-author"]')
    # Select the summary containers (not their flattened text nodes) so a
    # summary made of several text nodes cannot shift the following books.
    # To read an attribute instead of text, end the XPath with e.g. //a/@href.
    summary_nodes = tree.xpath('//div[@class="entry-summary"]')

    for position, book in enumerate(books):
        authors = []
        if position < len(author_nodes):
            authors = author_nodes[position].xpath('.//a[@rel="tag"]/text()')
        summary = ''
        if position < len(summary_nodes):
            summary = ''.join(summary_nodes[position].xpath('./p/text()'))
        book.append(authors)
        book.append(summary)
    return books
if __name__ == '__main__':
    # URL prefix of the listing pages; the page number is appended to it.
    front_url = 'http://www.allitebooks.org/page/'
    # Browser-like User-Agent header dict.
    head_list = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/79.0.3945.88 Safari/537.36'
        ),
    }
    # CSV variant (disabled): fp = open('allbook.csv', 'w', encoding='utf-8')
    ***

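## 还有一点要注意:我试了很多次,发现必须先把工作簿保存成文件,再用 load_workbook 重新打开,才能操作这个 Excel;如果直接写 openpyxl.Workbook("books.xlsx"),就会报错说工作表没有 cell 这个方法。原因是 Workbook 的第一个参数是 write_only 而不是文件名,传入文件名相当于创建了只写(write-only)工作簿,而只写工作表不支持 cell();改用不带参数的 openpyxl.Workbook() 就不会有这个问题。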

***
    # openpyxl.Workbook() does not take a filename: its first parameter is
    # ``write_only``, so Workbook("books.xlsx") builds a write-only workbook
    # whose sheets have no cell().  Reuse books.xlsx when it already exists,
    # otherwise start from a regular in-memory workbook.
    try:
        wb = openpyxl.load_workbook('books.xlsx')
    except FileNotFoundError:
        wb = openpyxl.Workbook()
    # Write into a sheet of our own, inserted at index 1, instead of
    # wb.active (the default sheet).
    ws = wb.create_sheet('book', 1)

    # One header cell per column: title, link, authors, summary.
    header = ['书名', '链接', '作者', '简介']
    for column, caption in enumerate(header, start=1):
        ws.cell(row=1, column=column, value=caption)
    # CSV variant (disabled):
    #   writer = csv.writer(fp)
    #   writer.writerow(header)

    results = []
    for index in range(1, 2):
        page_url = front_url + str(index) + '/'
        # NOTE(review): head_list defined above is not used; the headers come
        # from the wutils.defNum helper - confirm which one is intended.
        response = requests.get(
            url=page_url,
            headers=defNum.defNum().login_heards,
            timeout=10,  # without a timeout a stalled server blocks forever
        )
        data = response.content.decode()
        result = resolver_page(data)
        # CSV variant (disabled): writer.writerows(result)
        results.extend(result)
    # CSV variant (disabled): fp.close()

    for row_number, book in enumerate(results, start=2):
        # Tolerate rows that lack the authors and/or summary field.
        authors = book[2] if len(book) > 2 else []
        summary = book[3] if len(book) > 3 else None
        ws.cell(row=row_number, column=1, value=book[0])
        ws.cell(row=row_number, column=2, value=book[1])
        # All authors share one cell, so a long author list can neither
        # spill into nor be overwritten by the summary column.
        ws.cell(row=row_number, column=3, value=', '.join(authors))
        ws.cell(row=row_number, column=4, value=summary)
    wb.save('books.xlsx')
发布了7 篇原创文章 · 获赞 3 · 访问量 434

猜你喜欢

转载自blog.csdn.net/baidu_40492134/article/details/103717101