Empecemos por el código que utiliza LinearRegression, que es el siguiente:
from sklearn import linear_model as lm
import numpy as np
import os
import pandas as pd
def read_data(path):
    """Load a CSV file with pandas.

    :param path: location of the CSV file; forwarded unchanged to
        ``pd.read_csv``.
    :return: whatever ``pd.read_csv`` returns for ``path``.
    """
    return pd.read_csv(path)
def train_model(train_data, features, labels):
    """Train a linear regression on the training set and return it.

    The fitted intercept and coefficients are printed to stdout before the
    model is returned.

    :param train_data: training table; it is indexed with ``features`` and
        with ``labels`` to select the input and target columns.
    :param features: list of column names used as model inputs.
    :param labels: list of column names used as model targets.
    :return: the fitted ``lm.LinearRegression`` instance.
    """
    model = lm.LinearRegression()
    model.fit(train_data[features], train_data[labels])
    print(model.intercept_)
    print(model.coef_)
    return model
def linear_model(data, data_number):
    """Fit a linear regression on the leading rows of ``data``.

    :param data: table with an ``"x"`` column (feature) and a ``"y"`` column
        (label); it is sliced by row and indexed by column-name lists.
    :param data_number: split point; rows ``[:data_number]`` form the
        training set. Rows from ``data_number`` onward are held out and are
        not used by this function.
    :return: the fitted model produced by ``train_model``.
    """
    # Feature names; they must match the header row of the data file.
    features = ["x"]
    # Label names; they must match the header row of the data file.
    labels = ["y"]
    # Rows before the split point are the training set.
    train_data = data[:data_number]
    # Train the model and hand it back so callers can reuse it.
    return train_model(train_data, features, labels)
if __name__ == "__main__":
    # Directory that contains this script; the data file lives in ./data.
    home_path = os.path.dirname(os.path.abspath(__file__))
    # os.path.join inserts the separator of the host OS, so Windows and
    # Linux no longer need separate hand-written path templates.
    dataPath = os.path.join(home_path, "data", "simple_example.csv")
    data = read_data(dataPath)
    linear_model(data, data_number=15)
El contenido del archivo de datos simple_example.csv es el siguiente:
x,y
10,7.7
10,9.87
11,11.18
12,10.43
13,12.36
14,14.15
15,15.73
16,16.4
17,18.86
18,16.13
19,18.21
20,18.37
21,22.61
22,19.83
La idea básica del programa es la siguiente:
(1) Leer con pandas los datos en formato CSV y tomar las primeras 15 filas como conjunto de entrenamiento; ese número lo fija el parámetro data_number.
(2) Entrenar el modelo, lo que se hace en dos pasos: primero se define un modelo de regresión lineal y después se ajusta con los datos leídos. El código central es el siguiente:
model = lm.LinearRegression()
model.fit(train_data[features], train_data[labels])
El resultado del ajuste que produce el programa es el siguiente:
y=1.01211289x-0.62794705
Si se examina el método fit, se ve que la llamada central de dicho método es la siguiente:
linalg.lstsq(X, y)
Es un método que proporciona el paquete scipy para resolver Ax = b por mínimos cuadrados. Su código fuente está en scipy/scipy/linalg/basic.py, y el código completo es el siguiente:
# Linear Least Squares
def lstsq(a, b, cond=None, overwrite_a=False, overwrite_b=False,
          check_finite=True, lapack_driver=None):
    """
    Compute a least-squares solution of ``a x = b`` with a LAPACK driver.

    (The original upstream docstring was omitted in this excerpt; the
    description below is derived from the code.)

    Parameters
    ----------
    a : array_like
        Left-hand side; must be 2-D with shape (m, n).
    b : array_like
        Right-hand side; its first dimension must equal m. It may be 1-D or
        2-D (one column per right-hand side).
    cond : float, optional
        Cutoff forwarded to the LAPACK routine. When None, the machine
        epsilon of the routine's dtype is used.
    overwrite_a, overwrite_b : bool, optional
        Only forwarded to the 'gelss' driver; 'gelsd' and 'gelsy' are always
        called with both flags set to False.
    check_finite : bool, optional
        Forwarded to ``_asarray_validated`` for both inputs.
    lapack_driver : {'gelsd', 'gelsy', 'gelss'}, optional
        Driver to use. When None, ``lstsq.default_lapack_driver`` is used.

    Returns
    -------
    x : ndarray
        Solution, truncated to its first n rows when m > n.
    residues : ndarray or scalar
        For 'gelss'/'gelsd' with m > n and full column rank, the sum over
        axis 0 of ``abs(x[n:])**2`` taken from the raw LAPACK output;
        otherwise an empty array.
    rank : int
        Rank reported by the LAPACK routine (0 for zero-sized problems).
    s : ndarray or None
        Singular values for 'gelss'/'gelsd'; None for 'gelsy'.

    Raises
    ------
    ValueError
        If ``a`` is not 2-D, the row counts of ``a`` and ``b`` differ, the
        driver name is unknown, or LAPACK reports an illegal argument.
    LinAlgError
        If the 'gelss'/'gelsd' routine reports a positive ``info``.
    """
    a1 = _asarray_validated(a, check_finite=check_finite)
    b1 = _asarray_validated(b, check_finite=check_finite)
    if len(a1.shape) != 2:
        raise ValueError('Input array a should be 2-D')
    m, n = a1.shape
    if len(b1.shape) == 2:
        nrhs = b1.shape[1]
    else:
        nrhs = 1
    if m != b1.shape[0]:
        raise ValueError('Shape mismatch: a and b should have the same number'
                         ' of rows ({} != {}).'.format(m, b1.shape[0]))
    if m == 0 or n == 0:  # Zero-sized problem, confuses LAPACK
        x = np.zeros((n,) + b1.shape[1:], dtype=np.common_type(a1, b1))
        if n == 0:
            residues = np.linalg.norm(b1, axis=0)**2
        else:
            residues = np.empty((0,))
        return x, residues, 0, np.empty((0,))

    driver = lapack_driver
    if driver is None:
        driver = lstsq.default_lapack_driver
    if driver not in ('gelsd', 'gelsy', 'gelss'):
        raise ValueError('LAPACK driver "%s" is not found' % driver)

    lapack_func, lapack_lwork = get_lapack_funcs((driver,
                                                  '%s_lwork' % driver),
                                                 (a1, b1))
    real_data = lapack_func.dtype.kind == 'f'

    if m < n:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        if len(b1.shape) == 2:
            b2 = np.zeros((n, nrhs), dtype=lapack_func.dtype)
            b2[:m, :] = b1
        else:
            b2 = np.zeros(n, dtype=lapack_func.dtype)
            b2[:m] = b1
        b1 = b2

    overwrite_a = overwrite_a or _datacopied(a1, a)
    overwrite_b = overwrite_b or _datacopied(b1, b)

    if cond is None:
        cond = np.finfo(lapack_func.dtype).eps

    if driver in ('gelss', 'gelsd'):
        if driver == 'gelss':
            lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
            v, x, s, rank, work, info = lapack_func(a1, b1, cond, lwork,
                                                    overwrite_a=overwrite_a,
                                                    overwrite_b=overwrite_b)

        elif driver == 'gelsd':
            if real_data:
                lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork,
                                               iwork, cond, False, False)
            else:  # complex data
                lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
                                                     nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
                                               cond, False, False)
        if info > 0:
            raise LinAlgError("SVD did not converge in Linear Least Squares")
        if info < 0:
            # Report the driver actually used; ``lapack_driver`` is None
            # whenever the default driver was selected.
            raise ValueError('illegal value in %d-th argument of internal %s'
                             % (-info, driver))
        resids = np.asarray([], dtype=x.dtype)
        if m > n:
            x1 = x[:n]
            if rank == n:
                resids = np.sum(np.abs(x[n:])**2, axis=0)
            x = x1
        return x, resids, rank, s

    elif driver == 'gelsy':
        lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
        jptv = np.zeros((a1.shape[1], 1), dtype=np.int32)
        v, x, j, rank, info = lapack_func(a1, b1, jptv, cond,
                                          lwork, False, False)
        if info < 0:
            raise ValueError("illegal value in %d-th argument of internal "
                             "gelsy" % -info)
        if m > n:
            x1 = x[:n]
            x = x1
        return x, np.array([], x.dtype), rank, None


lstsq.default_lapack_driver = 'gelsd'
Ofrece tres drivers para resolver el problema: 'gelsd', 'gelsy' y 'gelss'. Como en el ejemplo no se pasa ningún parámetro que indique cuál utilizar, se emplea el predeterminado, gelsd. Por tanto, podemos centrarnos en el siguiente código:
# Excerpt of lstsq: the branch taken for the default 'gelsd' driver.
elif driver == 'gelsd':
# Real data: the workspace query yields two sizes, lwork and iwork.
if real_data:
lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
# The two trailing False arguments are passed positionally; presumably they
# are the overwrite_a/overwrite_b flags of the wrapped routine (confirm
# against the f2py signature).
x, s, rank, info = lapack_func(a1, b1, lwork,
iwork, cond, False, False)
# Complex data: the query additionally yields an rwork size.
else: # complex data
lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
nrhs, cond)
x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
cond, False, False)
Como los datos son reales, se ejecuta la rama del if, que llama a _compute_lwork y a lapack_func. _compute_lwork está en lapack.py, en el mismo directorio que basic.py; su código fuente es el siguiente:
def _compute_lwork(routine, *args, **kwargs):
    """
    Round floating-point lwork returned by lapack to integer.

    Several LAPACK routines compute optimal values for LWORK, which
    they return in a floating-point variable. However, for large
    values of LWORK, single-precision floating point is not sufficient
    to hold the exact value --- some LAPACK versions (<= 3.5.0 at
    least) truncate the returned integer to single precision and in
    some cases this can be smaller than the required value.

    Parameters
    ----------
    routine : callable
        Workspace-query routine. It is called as
        ``routine(*args, **kwargs)`` and must return a sequence whose last
        item is the ``info`` status and whose other items are sizes.
    *args, **kwargs
        Forwarded unchanged to ``routine``.

    Returns
    -------
    int32 scalar or ndarray of int32
        A single value when the routine reports one size, otherwise an
        array with one entry per reported size.

    Raises
    ------
    ValueError
        If the routine returns fewer than two values, reports a non-zero
        ``info``, or a size is negative or exceeds the int32 range.

    Examples
    --------
    >>> from scipy.linalg import lapack
    >>> n = 5000
    >>> s_r, s_lw = lapack.get_lapack_funcs(('sysvx', 'sysvx_lwork'))
    >>> lwork = lapack._compute_lwork(s_lw, n)
    >>> lwork
    32000

    """
    wi = routine(*args, **kwargs)
    if len(wi) < 2:
        # At least one size plus the trailing info status is required.
        raise ValueError('LAPACK workspace query returned fewer than two '
                         'values (expected sizes followed by info)')
    info = wi[-1]
    if info != 0:
        raise ValueError("Internal work array size computation failed: "
                         "%d" % (info,))

    # Sizes may come back as complex numbers; only the real part matters.
    lwork = [w.real for w in wi[:-1]]

    dtype = getattr(routine, 'dtype', None)
    if dtype == _np.float32 or dtype == _np.complex64:
        # Single-precision routine -- take next fp value to work
        # around possible truncation in LAPACK code
        lwork = _np.nextafter(lwork, _np.inf, dtype=_np.float32)

    lwork = _np.array(lwork, _np.int64)
    if _np.any(_np.logical_or(lwork < 0, lwork > _np.iinfo(_np.int32).max)):
        raise ValueError("Too large work array required -- computation cannot "
                         "be performed with standard 32-bit LAPACK.")
    lwork = lwork.astype(_np.int32)
    if lwork.size == 1:
        return lwork[0]
    return lwork
lapack_func se obtiene mediante get_lapack_funcs, que también está en lapack.py; su código fuente es el siguiente:
def get_lapack_funcs(names, arrays=(), dtype=None):
    """
    Look up LAPACK functions by name by delegating to ``_get_funcs``.

    (Part of the original docstring is omitted in this excerpt.)

    In LAPACK, the naming convention is that all functions start with a
    type prefix, which depends on the type of the principal
    matrix. These can be one of {'s', 'd', 'c', 'z'} for the numpy
    types {float32, float64, complex64, complex128} respectively, and
    are stored in attribute ``typecode`` of the returned functions.

    Parameters
    ----------
    names
        Function name(s) without the type prefix; forwarded to
        ``_get_funcs``.
    arrays : tuple, optional
        Arrays forwarded to ``_get_funcs``; presumably used to choose the
        type prefix -- confirm against ``_get_funcs``.
    dtype : optional
        Forwarded to ``_get_funcs``.

    Returns
    -------
    The result of ``_get_funcs`` for the LAPACK modules.
    """
    return _get_funcs(names, arrays, dtype,
                      "LAPACK", _flapack, _clapack,
                      "flapack", "clapack", _lapack_alias)
La función que realmente se llama es _get_funcs. Según el docstring, la convención de nomenclatura de LAPACK es que todas las funciones comienzan con un prefijo de tipo, determinado por el tipo de la matriz principal; hay cuatro: 's', 'd', 'c' y 'z'. Nuestros datos son float64, que corresponde exactamente a 'd'. Por tanto, el nombre de la función que se invoca a bajo nivel es:
scipy.linalg.lapack.dgelsd
Figura:
Lo que realmente se llama es un objeto Fortran. Esta función está declarada en flapack_gen.pyf.src, en el mismo directorio, y está escrita en Fortran; su código fuente es el siguiente:
! f2py signature for the LAPACK ?gelsd least-squares routine. <prefix2>,
! <ftype2> and <ctype2> are placeholders, presumably expanded per data type
! by the .pyf.src templating (confirm against the build).
subroutine <prefix2>gelsd(m,n,minmn,maxmn,nrhs,a,b,s,cond,r,work,lwork,size_iwork,iwork,info)
! x,s,rank,info = dgelsd(a,b,lwork,size_iwork,cond=-1.0,overwrite_a=True,overwrite_b=True)
! Solve Minimize 2-norm(A * X - B).
! Argument order of the underlying call; m and maxmn are passed a second
! time right after a and b respectively.
callstatement (*f2py_func)(&m,&n,&nrhs,a,&m,b,&maxmn,s,&cond,&r,work,&lwork,iwork,&info)
callprotoargument int*,int*,int*,<ctype2>*,int*,<ctype2>*,int*,<ctype2>*,<ctype2>*,int*,<ctype2>*,int*,int*,int*
! Dimensions are derived from the shape of a and marked intent(hide).
integer intent(hide),depend(a):: m = shape(a,0)
integer intent(hide),depend(a):: n = shape(a,1)
integer intent(hide),depend(m,n):: minmn = MIN(m,n)
integer intent(hide),depend(m,n):: maxmn = MAX(m,n)
! a is the (m,n) input matrix, declared intent(in,copy).
<ftype2> dimension(m,n),intent(in,copy) :: a
integer depend(b),intent(hide):: nrhs = shape(b,1)
! b must have maxmn rows; it is both input and output, and is exposed as
! x on the output side.
<ftype2> dimension(maxmn,nrhs),check(maxmn==shape(b,0)),depend(maxmn) :: b
intent(in,out,copy,out=x) b
<ftype2> intent(in),optional :: cond=-1.0
! r is exposed under the output name rank.
integer intent(out,out=rank) :: r
<ftype2> intent(out),dimension(minmn),depend(minmn) :: s
integer intent(in),check(lwork>=1) :: lwork
! Impossible to calculate lwork explicitly, need to obtain it from query call first
! Same for size_iwork
! work and iwork are hidden scratch arrays sized from lwork and size_iwork.
<ftype2> dimension(lwork),intent(cache,hide),depend(lwork) :: work
integer intent(in) :: size_iwork
integer intent(cache,hide),dimension(MAX(1,size_iwork)),depend(size_iwork) :: iwork
integer intent(out)::info
end subroutine <prefix2>gelsd