私募基金公示列表页:
网址:http://gs.amac.org.cn/amac-infodisc/res/pof/fund/index.html
打开网页,右键检查,查看network中的url,找出页面url变化的规律:
发现就是page这个参数的变化,ok
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 27 11:21:11 2018
@author: Belinda
"""
from lxml import etree
import requests
import csv
import time
from multiprocessing import *
def spider():
    """Scrape the AMAC private-fund disclosure list (pages 0-3) and append
    one CSV row per fund via the module-level ``writer``.

    Columns written per row: id, fundName, managerName, mandatorName,
    establishDate, recordTime.

    NOTE(review): this endpoint looks like a JSON API normally called via
    POST; if the GET response body is not an HTML page containing the
    ``#fundlist`` table, the XPath queries below will match nothing —
    confirm the actual response format.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Window NT 10.0; WOW64)\ AppleWebKit/537.36 (KTML,like Gecko) Chrome/46.0.2490.80 Safari/537.36'}
    for page in range(0, 4):
        # Only the `page` query parameter changes between requests.
        url = ('http://gs.amac.org.cn/amac-infodisc/api/pof/fund'
               '?rand=0.49229080398526315&page={}&size=20').format(page)
        resp = requests.get(url, headers=headers)
        time.sleep(1)  # be polite: at most one request per second
        # Parse the response into an element tree, then pull fields via XPath.
        selector = etree.HTML(resp.text)
        # BUG FIX: the original used tr[1], which selects only the FIRST
        # row of the table; plain tr selects every fund row, matching the
        # stated intent of extracting each row of the list.
        simu_list = selector.xpath('//*[@id="fundlist"]/tbody/tr')
        for simu in simu_list:
            # ''.join turns an empty XPath result (missing cell) into ''.
            # Renamed from `id` to avoid shadowing the builtin.
            fund_id = ''.join(simu.xpath('td[1]/text()'))
            fund_name = ''.join(simu.xpath('td[2]/a/text()'))
            manager_name = ''.join(simu.xpath('td[3]/a/text()'))
            mandator_name = ''.join(simu.xpath('td[4]/text()'))
            establish_date = ''.join(simu.xpath('td[5]/text()'))
            record_time = ''.join(simu.xpath('td[6]/text()'))
            item = (fund_id, fund_name, manager_name, mandator_name,
                    establish_date, record_time)
            print(item)
            # `writer` is the module-level csv.writer created in __main__.
            writer.writerow(item)
if __name__ == '__main__':
    # Context manager guarantees the CSV file is closed even if spider()
    # raises (the original open()/close() pair leaked the handle on error).
    # 'a+' appends, so repeated runs accumulate rows; newline='' is the
    # csv-module requirement to avoid blank lines on Windows.
    with open("./simuwang.csv", 'a+', encoding="utf-8", newline="") as fp:
        writer = csv.writer(fp)
        # Header row naming each CSV column.
        writer.writerow(('id', 'fundName', 'managerName', 'mandatorName',
                         'establishDate', 'recordTime'))
        spider()
    print("爬取结束!")
遇到一个问题,尚未解决:接口返回的数据未能按预期解析出来(该接口疑似需要 POST 请求并返回 JSON,而非可供 XPath 解析的 HTML,待确认)。