# 话不多说,直接上代码 (Without further ado, here is the code.)
import csv, requests, re
from bs4 import BeautifulSoup
from lxml import etree
url = 'https://www.v2ex.com/?tab=all'
# Item links in the page are root-relative ("/t/12345#reply3"), so they must
# be joined onto the site root, NOT onto the listing URL above.
BASE_URL = 'https://www.v2ex.com'

# NOTE: dead alternative implementation (BeautifulSoup + regex), kept for
# reference only — this triple-quoted string is never executed.
'''
#soup加正则
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
articles = []
for article in soup.find_all(class_='cell item'):
title = article.find(class_='item_title').get_text()
category = article.find(class_='node').get_text()
author = re.findall(r'(?<=<a href="/member/).+(?="><img)', str(article))[0]
#print(author)
u = article.select('.item_title > a')
#print(u)
link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+(?=")', str(u))[0]
articles.append([title, category, author, link])
print(articles)
'''

# Active implementation: lxml + XPath.
# Scrape every front-page item and collect [title, category, author, link].
response = requests.get(url).text
html = etree.HTML(response)
tag_div = html.xpath('//div[@class="box"]/div[@class="cell item"]')
articles = []
for each in tag_div:
    title = each.xpath('./table//tr/td[3]/span[1]/a/text()')[0]
    href = each.xpath('./table//tr/td[3]/span[1]/a/@href')
    # Join the root-relative href(s) onto the site root.
    # (Bug fix: the original concatenated onto "...?tab=all".)
    urlhref = [BASE_URL + i for i in href]
    category = each.xpath('./table//tr/td[3]/span[2]/a/text()')[0]
    author = each.xpath('./table//tr/td[3]/span[2]/strong[1]//text()')[0]
    # Bug fix: row order now matches the CSV header below
    # (title, category, author, link) — author/category were swapped.
    articles.append([title, category, author, ''.join(urlhref)])
print(articles)

# newline='' prevents blank rows on Windows (csv module requirement);
# utf-8-sig so Excel recognizes the Chinese header correctly.
with open('v2ex3.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['文章标题', '分类', '作者', '文章地址'])
    writer.writerows(articles)