先来从LinearRegression的使用开始,代码如下:
from sklearn import linear_model as lm
import numpy as np
import os
import pandas as pd
def read_data(path):
"""
使用pandas读取数据
"""
return pd.read_csv(path)
def train_model(train_data, features, labels):
"""
根据训练数据集训练模型,并返回训练好的模型
:param train_data:
:param features:
:param labels:
:return:
"""
model = lm.LinearRegression()
model.fit(train_data[features], train_data[labels])
print(model.intercept_)
print(model.coef_)
return model
def linear_model(data, data_number):
"""
:param data:
:return:
"""
# 特征的名称,和数据文件中第一行标题行对应
features = ["x"]
# 标签名称,和数据文件中第一行标题行对应
labels = ["y"]
# 将数据分为训练数据集和测试数据集,以data_number为分割线,下标0~data_number的为训练集
train_data = data[:data_number]
test_data = data[data_number:]
# 训练模型
model = train_model(train_data, features, labels)
if __name__ == "__main__":
home_path = os.path.dirname(os.path.abspath(__file__))
# Windows下的存储路径与Linux并不相同
if os.name == "nt":
dataPath = "%s\\data\\simple_example.csv" % home_path
else:
dataPath = "%s/data/simple_example.csv" % home_path
data = read_data(dataPath)
linear_model(data, data_number=15)
simple_example.csv数据文件内容如下:
x,y
10,7.7
10,9.87
11,11.18
12,10.43
13,12.36
14,14.15
15,15.73
16,16.4
17,18.86
18,16.13
19,18.21
20,18.37
21,22.61
22,19.83
程序基本思路
(1)使用pandas读取csv格式数据,读取前15条,由参数data_number设定;
(2)训练模型,主要分为两步,第一步定义一个线性回归模型,第二步使用读取的数据进行拟合,核心代码如下:
model = lm.LinearRegression()
model.fit(train_data[features], train_data[labels])
程序拟合结果如下:
y=1.01211289x-0.62794705
可以通过fit方法看到,核心调用的是如下这个方法:
linalg.lstsq(X, y)
它是scipy包提供的一个使用最小二乘法求解Ax=b问题解的方法,该方法在scipy源码的scipy/scipy/linalg/basic.py中,整个源码如下:
# Linear Least Squares
def lstsq(a, b, cond=None, overwrite_a=False, overwrite_b=False,
check_finite=True, lapack_driver=None):
"""
省略了注释...
"""
a1 = _asarray_validated(a, check_finite=check_finite)
b1 = _asarray_validated(b, check_finite=check_finite)
if len(a1.shape) != 2:
raise ValueError('Input array a should be 2-D')
m, n = a1.shape
if len(b1.shape) == 2:
nrhs = b1.shape[1]
else:
nrhs = 1
if m != b1.shape[0]:
raise ValueError('Shape mismatch: a and b should have the same number'
' of rows ({} != {}).'.format(m, b1.shape[0]))
if m == 0 or n == 0: # Zero-sized problem, confuses LAPACK
x = np.zeros((n,) + b1.shape[1:], dtype=np.common_type(a1, b1))
if n == 0:
residues = np.linalg.norm(b1, axis=0)**2
else:
residues = np.empty((0,))
return x, residues, 0, np.empty((0,))
driver = lapack_driver
if driver is None:
driver = lstsq.default_lapack_driver
if driver not in ('gelsd', 'gelsy', 'gelss'):
raise ValueError('LAPACK driver "%s" is not found' % driver)
lapack_func, lapack_lwork = get_lapack_funcs((driver,
'%s_lwork' % driver),
(a1, b1))
real_data = True if (lapack_func.dtype.kind == 'f') else False
if m < n:
# need to extend b matrix as it will be filled with
# a larger solution matrix
if len(b1.shape) == 2:
b2 = np.zeros((n, nrhs), dtype=lapack_func.dtype)
b2[:m, :] = b1
else:
b2 = np.zeros(n, dtype=lapack_func.dtype)
b2[:m] = b1
b1 = b2
overwrite_a = overwrite_a or _datacopied(a1, a)
overwrite_b = overwrite_b or _datacopied(b1, b)
if cond is None:
cond = np.finfo(lapack_func.dtype).eps
if driver in ('gelss', 'gelsd'):
if driver == 'gelss':
lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
v, x, s, rank, work, info = lapack_func(a1, b1, cond, lwork,
overwrite_a=overwrite_a,
overwrite_b=overwrite_b)
elif driver == 'gelsd':
if real_data:
lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
x, s, rank, info = lapack_func(a1, b1, lwork,
iwork, cond, False, False)
else: # complex data
lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
nrhs, cond)
x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
cond, False, False)
if info > 0:
raise LinAlgError("SVD did not converge in Linear Least Squares")
if info < 0:
raise ValueError('illegal value in %d-th argument of internal %s'
% (-info, lapack_driver))
resids = np.asarray([], dtype=x.dtype)
if m > n:
x1 = x[:n]
if rank == n:
resids = np.sum(np.abs(x[n:])**2, axis=0)
x = x1
return x, resids, rank, s
elif driver == 'gelsy':
lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
jptv = np.zeros((a1.shape[1], 1), dtype=np.int32)
v, x, j, rank, info = lapack_func(a1, b1, jptv, cond,
lwork, False, False)
if info < 0:
raise ValueError("illegal value in %d-th argument of internal "
"gelsy" % -info)
if m > n:
x1 = x[:n]
x = x1
return x, np.array([], x.dtype), rank, None
lstsq.default_lapack_driver = 'gelsd'
提供了三种求解方式:'gelsd', 'gelsy', 'gelss',我们发现在例子中并没有提供参数传入想要使用的方式,默认使用gelsd。所以我们可以聚焦到如下这段代码:
elif driver == 'gelsd':
if real_data:
lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
x, s, rank, info = lapack_func(a1, b1, lwork,
iwork, cond, False, False)
else: # complex data
lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
nrhs, cond)
x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
cond, False, False)
给定的是实数,所以调用if中的_compute_lwork和lapack_func方法。_compute_lwork在与basic.py同一目录下的lapack.py中,源码如下:
def _compute_lwork(routine, *args, **kwargs):
"""
Round floating-point lwork returned by lapack to integer.
Several LAPACK routines compute optimal values for LWORK, which
they return in a floating-point variable. However, for large
values of LWORK, single-precision floating point is not sufficient
to hold the exact value --- some LAPACK versions (<= 3.5.0 at
least) truncate the returned integer to single precision and in
some cases this can be smaller than the required value.
Examples
--------
>>> from scipy.linalg import lapack
>>> n = 5000
>>> s_r, s_lw = lapack.get_lapack_funcs(('sysvx', 'sysvx_lwork'))
>>> lwork = lapack._compute_lwork(s_lw, n)
>>> lwork
32000
"""
wi = routine(*args, **kwargs)
if len(wi) < 2:
raise ValueError('')
info = wi[-1]
if info != 0:
raise ValueError("Internal work array size computation failed: "
"%d" % (info,))
lwork = [w.real for w in wi[:-1]]
dtype = getattr(routine, 'dtype', None)
if dtype == _np.float32 or dtype == _np.complex64:
# Single-precision routine -- take next fp value to work
# around possible truncation in LAPACK code
lwork = _np.nextafter(lwork, _np.inf, dtype=_np.float32)
lwork = _np.array(lwork, _np.int64)
if _np.any(_np.logical_or(lwork < 0, lwork > _np.iinfo(_np.int32).max)):
raise ValueError("Too large work array required -- computation cannot "
"be performed with standard 32-bit LAPACK.")
lwork = lwork.astype(_np.int32)
if lwork.size == 1:
return lwork[0]
return lwork
lapack_func是由get_lapack_funcs得到,它也在lapack.py中,源码如下:
def get_lapack_funcs(names, arrays=(), dtype=None):
"""
省略部分注释...
In LAPACK, the naming convention is that all functions start with a
type prefix, which depends on the type of the principal
matrix. These can be one of {'s', 'd', 'c', 'z'} for the numpy
types {float32, float64, complex64, complex128} respectively, and
are stored in attribute ``typecode`` of the returned functions.
"""
return _get_funcs(names, arrays, dtype,
"LAPACK", _flapack, _clapack,
"flapack", "clapack", _lapack_alias)
实际调用的是_get_funcs函数,从注释中我们看到,LAPACK的命名习惯在所有函数以类型作为前缀,这个类型由初始矩阵决定,这里给出了's','d','c','z'四种类型,我们的数据是float64,正好对应'd'。底层调用的函数名是:
scipy.linalg.lapack.dgelsd
如图:
实际调用的是一个fortran对象。而这个函数就在同目录下的flapack_gen.pyf.src中,它使用fortran语言编写,源码如下:
subroutine <prefix2>gelsd(m,n,minmn,maxmn,nrhs,a,b,s,cond,r,work,lwork,size_iwork,iwork,info)
! x,s,rank,info = dgelsd(a,b,lwork,size_iwork,cond=-1.0,overwrite_a=True,overwrite_b=True)
! Solve Minimize 2-norm(A * X - B).
callstatement (*f2py_func)(&m,&n,&nrhs,a,&m,b,&maxmn,s,&cond,&r,work,&lwork,iwork,&info)
callprotoargument int*,int*,int*,<ctype2>*,int*,<ctype2>*,int*,<ctype2>*,<ctype2>*,int*,<ctype2>*,int*,int*,int*
integer intent(hide),depend(a):: m = shape(a,0)
integer intent(hide),depend(a):: n = shape(a,1)
integer intent(hide),depend(m,n):: minmn = MIN(m,n)
integer intent(hide),depend(m,n):: maxmn = MAX(m,n)
<ftype2> dimension(m,n),intent(in,copy) :: a
integer depend(b),intent(hide):: nrhs = shape(b,1)
<ftype2> dimension(maxmn,nrhs),check(maxmn==shape(b,0)),depend(maxmn) :: b
intent(in,out,copy,out=x) b
<ftype2> intent(in),optional :: cond=-1.0
integer intent(out,out=rank) :: r
<ftype2> intent(out),dimension(minmn),depend(minmn) :: s
integer intent(in),check(lwork>=1) :: lwork
! Impossible to calculate lwork explicitly, need to obtain it from query call first
! Same for size_iwork
<ftype2> dimension(lwork),intent(cache,hide),depend(lwork) :: work
integer intent(in) :: size_iwork
integer intent(cache,hide),dimension(MAX(1,size_iwork)),depend(size_iwork) :: iwork
integer intent(out)::info
end subroutine <prefix2>gelsd