Scraping articles with BeautifulSoup, re, and XPath and saving them to a CSV file

Without further ado, here's the code:

import csv, requests, re
from bs4 import BeautifulSoup
from lxml import etree

url = 'https://www.v2ex.com/?tab=all'
'''
# BeautifulSoup + regex version
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
articles = []
for article in soup.find_all(class_='cell item'):
    title = article.find(class_='item_title').get_text()
    category = article.find(class_='node').get_text()
    author = re.findall(r'(?<=<a href="/member/).+(?="><img)', str(article))[0]
    #print(author)
    u = article.select('.item_title > a')
    #print(u)
    link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+(?=")', str(u))[0]
    articles.append([title, category, author, link])
print(articles)
'''
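# A regex-free variant of the author lookup above: BeautifulSoup can match
# the href attribute directly, which is less brittle than running a
# lookbehind over str(article). (A sketch; it assumes the first /member/
# link inside each cell is the author's profile link.)
'''
for article in soup.find_all(class_='cell item'):
    member_link = article.find('a', href=re.compile(r'^/member/'))
    author = member_link['href'].split('/member/')[1]
'''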

# XPath version
response = requests.get(url).text
html = etree.HTML(response)
#print(html)
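# If the response comes back empty or as a 403 page, V2EX may be rejecting
# the default requests User-Agent; sending a browser-like header usually
# helps (the UA string here is only an example):
#response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text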

tag_div = html.xpath('//div[@class="box"]/div[@class="cell item"]')
#print(tag_div)
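# Note: @class="cell item" is an exact string match on the attribute; if the
# site ever adds another class to these divs, a looser predicate such as
# //div[contains(@class, "cell item")] would keep matching (an assumption
# about possible markup changes, not the current page).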

base = 'https://www.v2ex.com'  # hrefs are relative paths like /t/123456
articles = []
for each in tag_div:
    title = each.xpath('./table//tr/td[3]/span[1]/a/text()')[0]
    href = each.xpath('./table//tr/td[3]/span[1]/a/@href')[0]
    # Join against the site root, not the ?tab=all listing URL,
    # which would produce a broken address.
    link = base + href
    category = each.xpath('./table//tr/td[3]/span[2]/a/text()')[0]
    author = each.xpath('./table//tr/td[3]/span[2]/strong[1]//text()')[0]
    # Same column order as the CSV header written below.
    articles.append([title, category, author, link])
print(articles)

# newline='' prevents blank rows on Windows; utf-8 keeps non-ASCII titles intact
with open('v2ex3.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'Category', 'Author', 'URL'])
    writer.writerows(articles)
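
To sanity-check the output, you can read the file straight back with the same csv module (a minimal sketch, assuming v2ex3.csv was written as above):

import csv

with open('v2ex3.csv', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row)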

Reposted from blog.csdn.net/hellenlee22/article/details/89856812