数据科学与python语言实验——NumPy数值计算基础

NumPy数值计算基础

实验数据：
链接：https://pan.baidu.com/s/1-E2ShVTdI0X5lwDtMLFFsQ
提取码：0929

代码实现：

之前不会的地方：

1．读取文件
使用numpy内置的loadtxt()函数，其主要参数为：fname：要读取的文件；dtype：读取后的数据类型；delimiter：读取文件中的数据分隔符

 #参数列表：fname要读取的文件，dtype读取后的数据类型，delimiter读取文件中数据的分隔符
        self.data=np.loadtxt(fname=path,dtype=str,delimiter=',')

2．数据的切分索引

self.colindex=self.data[0,:]  # grab the header (label) row first, while it is still row 0
self.data=self.data[1:,:]  # then drop the header row so only data records remain

3．将数据中带小数部分的年份和季度（如2000.0）转化为不带小数的整数形式，这里使用了numpy中的np.char.replace()函数

 #需要将数据中的年份和季度中的小数部分去掉
self.data[:,:2]=np.char.replace(self.data[:,:2],'.0','')
#data[:,:2]是数据中的前两列

4.查找满足条件的行索引

 index = np.where((self.data[:, 0] == year) & (self.data[:, 1] == quarter))  
 # 使用where方法返回符合给定年份和季度的行索引

5.实现变量的展平

import numpy as np


fp='./macrodata.csv'  # path of the CSV file to read
op='./test.csv'  # path of the output file

class processdata:
    """Query and reshape a comma-separated table of quarterly records.

    The file is loaded as strings. Its first row is kept as the header in
    ``colindex`` and the remaining rows are kept in ``data``. The column
    order is assumed (not verified) to match ``colmap``.
    """

    # Maps a column name to its column position in ``self.data``.
    colmap = {
        'year': 0, 'quarter': 1, 'gdp': 2, 'realcons': 3, 'realinv': 4,
        'realgovt': 5, 'realdpi': 6, 'cpi': 7, 'm1': 8, 'tbilrate': 9,
        'unemp': 10, 'pop': 11, 'infl': 12, 'realint': 13,
    }

    def __init__(self, path):
        """Load the table from *path* and normalize the year/quarter columns.

        Args:
            path: Anything ``np.loadtxt`` accepts as ``fname``.
        """
        # ndmin=2 keeps the array two-dimensional even for a one-line file.
        raw = np.loadtxt(fname=path, dtype=str, delimiter=',', ndmin=2)
        # Take the header BEFORE slicing it away; reading row 0 afterwards
        # would return the first data record instead of the labels.
        self.colindex = raw[0, :]
        self.data = raw[1:, :]
        # Year and quarter are stored like "2000.0"; strip the ".0" so they
        # compare equal to str(int(...)) keys used by the lookup methods.
        self.data[:, :2] = np.char.replace(self.data[:, :2], '.0', '')

    def _find_row(self, year, quarter):
        """Find the first row matching normalized *year* / *quarter* strings.

        Returns:
            ``(row, None)`` when a row matches, otherwise ``(None, message)``
            where *message* says whether the year or the quarter is missing.
        """
        year_mask = self.data[:, 0] == year
        hits = np.where(year_mask & (self.data[:, 1] == quarter))[0]
        if hits.size:
            return int(hits[0]), None
        if not year_mask.any():
            return None, 'The given year ' + year + ' is out of range...'
        return None, ('The given quarter ' + quarter
                      + ' is not found for the given year ' + year + '...')

    def lookupdata(self, year, quarter, col):
        """Print and return the value of column *col* for a year and quarter.

        Args:
            year: Year, convertible with ``int()``.
            quarter: Quarter, convertible with ``int()``.
            col: A key of ``colmap`` (e.g. ``'gdp'`` or ``'pop'``).

        Returns:
            The stored string value, or the error message string when no
            matching row exists.

        Raises:
            KeyError: If a row is found but *col* is not a key of ``colmap``.
        """
        year = str(int(year))
        quarter = str(int(quarter))
        row, error = self._find_row(year, quarter)
        if error is not None:
            print(error)
            return error
        value = self.data[row][processdata.colmap[col]]
        print('The ' + col + ' in quarter ' + quarter + ', year ' + year
              + ' is ' + str(value) + '...')
        return value

    def calPerCapitaGDP(self, year, quarter):
        """Print and return gdp divided by pop for a year and quarter.

        Args:
            year: Year, convertible with ``int()``.
            quarter: Quarter, convertible with ``int()``.

        Returns:
            The quotient as a ``float``, or the error message string when no
            matching row exists.
        """
        year = str(int(year))
        quarter = str(int(quarter))
        row, error = self._find_row(year, quarter)
        if error is not None:
            print(error)
            return error
        record = self.data[row]
        per_capita = (float(record[processdata.colmap['gdp']])
                      / float(record[processdata.colmap['pop']]))
        print('The Per Capita GDP in quarter ' + quarter + ', year ' + year
              + ' is ' + str(per_capita) + '...')
        return per_capita

    def flattendata(self, col=('gdp', 'pop')):
        """Reshape the selected columns from wide to long form.

        Args:
            col: Sequence of ``colmap`` keys naming the columns to flatten.

        Returns:
            A 2-D string array whose first row is the header
            ``year, quarter, values, columns number`` and which then holds
            one row per (record, selected column) pair.
        """
        collist = [processdata.colmap[name] for name in col]
        selected = self.data[:, collist]
        # Repeat each (year, quarter) pair once per flattened column so it
        # lines up with the row-major order produced by flatten().
        indexdata = self.data[:, [0, 1]].repeat(len(col), axis=0)
        newdata = selected.flatten().reshape(-1, 1)
        # Column numbers cycle c0, c1, ..., c0, c1, ... in the same order.
        # Convert to strings explicitly instead of relying on implicit
        # int-to-string promotion when stacking.
        newcol = np.array(collist * self.data.shape[0]).reshape(-1, 1).astype(str)
        body = np.hstack((indexdata, newdata, newcol))
        newcolindex = np.array(['year', 'quarter', 'values', 'columns number'])
        return np.vstack((newcolindex, body))

    def printdata(self):
        """Print the shape of the gdp column stacked on top of the pop column."""
        gdp = self.data[:, processdata.colmap['gdp']].reshape(-1, 1)
        pop = self.data[:, processdata.colmap['pop']].reshape(-1, 1)
        data1 = np.vstack((gdp, pop))

        print(data1.shape)


# Build the processor once; every query below reuses the same loaded table.
prdata = processdata(fp)

# (a)-(d): point lookups of a single column for a given year and quarter.
print('(a):')
prdata.lookupdata(year=2000, quarter=1, col='gdp')
print('(b):')
prdata.lookupdata(year=2000, quarter=1, col='pop')
print('(c):')
prdata.lookupdata(year=2020, quarter=1, col='gdp')
print('(d):')
prdata.lookupdata(year=2000, quarter=6, col='pop')

# (e): per-capita GDP for one quarter.
print('(e):')
prdata.calPerCapitaGDP(year=2000, quarter=1)

# (f): flatten the default columns and write the result as comma-separated text.
print('(f):')
flatdata = prdata.flattendata()

np.savetxt(fname=op, X=flatdata, fmt='%s', delimiter=',')

print('End!')
# Keep the console window open until the user presses Enter.
input('按回车键结束')

数据科学与python语言实验——NumPy数值计算基础

NumPy数值计算基础

猜你喜欢