My first crawler script — it scrapes only a title, an image URL, and a tag snippet from each post. The contents are as follows:
from bs4 import BeautifulSoup
import requests
def paChong(n):
    """Scrape post titles, image URLs and tag text from the first *n*
    list pages of https://www.baomi.net and append them to ``123.txt``.

    Parameters
    ----------
    n : int
        Number of list pages to fetch, starting from page 1. Non-int
        input prints a usage hint and returns without scraping.
    """
    if not isinstance(n, int):
        print('Please input "int" ,like 1 or 2 .')
        # BUG FIX: the original referenced the builtin ``exit`` without
        # calling it, so execution fell through and later crashed with a
        # TypeError inside ``range``. Return early instead.
        return

    def re_img(a):
        # The ``src`` attribute is a proxied URL; the real image URL sits
        # between the last 'src=' and the following '&h' parameter.
        # (Hoisted out of the page loop — no need to redefine it per page.)
        index1 = a.rfind('src=')
        index2 = a.rfind('&h')
        return a[index1 + 4:index2]

    page_num = 1
    # ``with`` guarantees the file is closed even if a request raises.
    with open('123.txt', 'a', encoding='utf-8') as f:
        for i in range(1, n + 1):
            pk = 1
            urls = 'https://www.baomi.net/page/%d' % i
            f.write(" -------- " + urls + " --------\n\n")
            # timeout added so a stalled server cannot hang the run forever
            wb_data = requests.get(urls, timeout=10)
            soup = BeautifulSoup(wb_data.text, 'lxml')
            titles = soup.select('#page-content > div > div > div > div > div.ajax-load-box.posts-con > div > div > div.posts-default-box > div.posts-default-title > h2 > a')
            imgs = soup.select('#page-content > div > div > div > div > div > div > div > div > a > img')
            labels = soup.select('#page-content > div > div > div > div > div > div > div > div.posts-default-box > div.posts-default-content > div.posts-text')
            # zip truncates to the shortest list, so mismatched selector
            # hits simply drop the unpaired tail entries.
            for title, img, label in zip(titles, imgs, labels):
                data = {
                    'title': title.get('title'),
                    'img': re_img(img.get('src')),
                    # NOTE(review): slicing off the last 8 chars presumably
                    # strips a fixed suffix like a "read more" marker — confirm
                    # against the live page markup.
                    'label': label.get_text()[0:-8],
                }
                f.write("--" + str(page_num) + '-' + str(pk) + "--\n")
                f.write(data['title'] + '\n')
                f.write(data['img'] + '\n')
                f.write(data['label'])
                f.write('\n\n\n\n')
                pk += 1
                page_num += 1
# Guard the entry point so importing this module does not immediately
# kick off a 10-page network scrape as a side effect.
if __name__ == "__main__":
    paChong(10)