HW1 PM2.5 代码逐行解析

import csv
import numpy as np
from numpy.linalg import inv
import random
import math
import sys

#读数据
data = []
# 每一个维度储存一种污染物的数据
for i in range(18):
	data.append([])

n_row = 0
text = open('C:/Users/liky/Desktop/PY/2017fall-ml-hw1-master/2017fall-ml-hw1-master/train.csv', 'r', encoding='big5')
row = csv.reader(text , delimiter=",")

for r in row:
    # 第0行沒有数据
    if n_row != 0:
        # 每一行只有第3-27格有值(1天內24小时的数值)
        for i in range(3,27):
            if r[i] != "NR":
                data[(n_row-1)%18].append(float(r[i]))
            else:
                data[(n_row-1)%18].append(float(0))
    n_row = n_row+1
text.close()

#解析数据到(x,y)
x = []
y = []
# 每12个月
for i in range(12):
    # 一個月取连续10小时的data可以有471组
    for j in range(471):
        x.append([])
        # 18种污染物
        for t in range(18):
            # 连续9小時
            for s in range(9):
                x[471*i+j].append(data[t][480*i+j+s] )   #一个x中的值为1到9时的值(18行9列)  对应的y的值为第10时的PM2.5的值
        y.append(data[9][480*i+j+9])

#创建x和y的数组
x = np.array(x)
y = np.array(y)


# add square term
# x = np.concatenate((x,x**2), axis=1)

# add bias
x = np.concatenate((np.ones((x.shape[0],1)),x), axis=1)  # bias

#初始参数
w = np.zeros(len(x[0]))
l_rate = 10
repeat = 10000


#check answer
# use close form to check whether ur gradient descent is good
# however, this cannot be used in hw1.sh
# w = np.matmul(np.matmul(inv(np.matmul(x.transpose(),x)),x.transpose()),y)

#开始训练
x_t = x.transpose()            #转置
s_gra = np.zeros(len(x[0]))

for i in range(2):
    hypo = np.dot(x,w)                      #矩阵相乘,得到一个行数和x相同,列数为1的矩阵
    loss = hypo - y                         # hypo = w1*x1 + w2*x2 + ... + wt*xt + bias
    cost = np.sum(loss**2) / len(x)         #...
    cost_a = math.sqrt(cost)                #求loss值,对结果无影响
    gra = np.dot(x_t,loss)                  # 求微分 loss = (hypo - y)   --->   &gra/&w1 = (hypo - y) * x1 = loss * x1
    s_gra += gra**2                         #求 adagrad
    ada = np.sqrt(s_gra)                    #...
    w = w - l_rate * gra/ada                #得出下一步的点
    print('iteration: %d | Cost: %f  ' % ( i,cost_a))


# save model
np.save('model.npy',w)
# read model
w = np.load('model.npy')

#读测试数据
test_x = []
n_row = 0
text = open('C:/Users/liky/Desktop/PY/2017fall-ml-hw1-master/2017fall-ml-hw1-master/test.csv' ,"r")
row = csv.reader(text , delimiter= ",")

for r in row:
    if n_row %18 == 0:
        test_x.append([])
        for i in range(2,11):
            test_x[n_row//18].append(float(r[i]) )           #把每天的九个小时的数据作为一组存到一个test_x里
    else :
        for i in range(2,11):
            if r[i] !="NR":
                test_x[n_row//18].append(float(r[i]))
            else:
                test_x[n_row//18].append(0)
    n_row = n_row+1
text.close()
test_x = np.array(test_x)

# add square term
# test_x = np.concatenate((test_x,test_x**2), axis=1)

# add bias
test_x = np.concatenate((np.ones((test_x.shape[0],1)),test_x), axis=1)

#得到结果
ans = []
for i in range(len(test_x)):
    ans.append(["id_"+str(i)])
    a = np.dot(w,test_x[i])
    ans[i].append(a)



filename = "C:/Users/liky/Desktop/PY/2017fall-ml-hw1-master/2017fall-ml-hw1-master/predict.csv"
text = open(filename, "w+")
s = csv.writer(text,delimiter=',',lineterminator='\n')
s.writerow(["id","value"])
for i in range(len(ans)):
    s.writerow(ans[i])
text.close()

猜你喜欢

转载自blog.csdn.net/li_k_y/article/details/84104103