先来从LinearRegression的使用开始，代码如下：

from sklearn import linear_model as lm
import numpy as np
import os
import pandas as pd


def read_data(path):
    """
    使用pandas读取数据
    """

    return pd.read_csv(path)


def train_model(train_data, features, labels):
    """
    根据训练数据集训练模型，并返回训练好的模型
    :param train_data: 
    :param features: 
    :param labels:  
    :return:
    """

    model = lm.LinearRegression()
    model.fit(train_data[features], train_data[labels])
    print(model.intercept_)
    print(model.coef_)
    
    return model


def linear_model(data, data_number):
    """

    :param data:
    :return:
    """
    # 特征的名称，和数据文件中第一行标题行对应
    features = ["x"]
    # 标签名称，和数据文件中第一行标题行对应
    labels = ["y"]
    # 将数据分为训练数据集和测试数据集，以data_number为分割线，下标0~data_number的为训练集
    train_data = data[:data_number]
    test_data = data[data_number:]
    # 训练模型
    model = train_model(train_data, features, labels)


if __name__ == "__main__":
    home_path = os.path.dirname(os.path.abspath(__file__))
    # Windows下的存储路径与Linux并不相同
    if os.name == "nt":
        dataPath = "%s\\data\\simple_example.csv" % home_path
    else:
        dataPath = "%s/data/simple_example.csv" % home_path
    data = read_data(dataPath)

    linear_model(data, data_number=15)

simple_example.csv数据文件内容如下：

x,y
10,7.7
10,9.87
11,11.18
12,10.43
13,12.36
14,14.15
15,15.73
16,16.4
17,18.86
18,16.13
19,18.21
20,18.37
21,22.61
22,19.83

程序基本思路

（1）使用pandas读取csv格式数据，读取前15条，由参数data_number设定；

（2）训练模型，主要分为两步，第一步定义一个线性回归模型，第二步使用读取的数据进行拟合，核心代码如下：

model = lm.LinearRegression()
model.fit(train_data[features], train_data[labels])

程序拟合结果如下：

y=1.01211289x-0.62794705

可以通过fit方法看到，核心调用的是如下这个方法：

linalg.lstsq(X, y)

它是scipy包提供的一个使用最小二乘法求解Ax=b问题解的方法，该方法在scipy源码的scipy/scipy/linalg/basic.py中，整个源码如下：

# Linear Least Squares
def lstsq(a, b, cond=None, overwrite_a=False, overwrite_b=False,
          check_finite=True, lapack_driver=None):
    """
    省略了注释...

    """
    a1 = _asarray_validated(a, check_finite=check_finite)
    b1 = _asarray_validated(b, check_finite=check_finite)
    if len(a1.shape) != 2:
        raise ValueError('Input array a should be 2-D')
    m, n = a1.shape
    if len(b1.shape) == 2:
        nrhs = b1.shape[1]
    else:
        nrhs = 1
    if m != b1.shape[0]:
        raise ValueError('Shape mismatch: a and b should have the same number'
                         ' of rows ({} != {}).'.format(m, b1.shape[0]))
    if m == 0 or n == 0:  # Zero-sized problem, confuses LAPACK
        x = np.zeros((n,) + b1.shape[1:], dtype=np.common_type(a1, b1))
        if n == 0:
            residues = np.linalg.norm(b1, axis=0)**2
        else:
            residues = np.empty((0,))
        return x, residues, 0, np.empty((0,))

    driver = lapack_driver
    if driver is None:
        driver = lstsq.default_lapack_driver
    if driver not in ('gelsd', 'gelsy', 'gelss'):
        raise ValueError('LAPACK driver "%s" is not found' % driver)

    lapack_func, lapack_lwork = get_lapack_funcs((driver,
                                                 '%s_lwork' % driver),
                                                 (a1, b1))
    real_data = True if (lapack_func.dtype.kind == 'f') else False

    if m < n:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        if len(b1.shape) == 2:
            b2 = np.zeros((n, nrhs), dtype=lapack_func.dtype)
            b2[:m, :] = b1
        else:
            b2 = np.zeros(n, dtype=lapack_func.dtype)
            b2[:m] = b1
        b1 = b2

    overwrite_a = overwrite_a or _datacopied(a1, a)
    overwrite_b = overwrite_b or _datacopied(b1, b)

    if cond is None:
        cond = np.finfo(lapack_func.dtype).eps

    if driver in ('gelss', 'gelsd'):
        if driver == 'gelss':
            lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
            v, x, s, rank, work, info = lapack_func(a1, b1, cond, lwork,
                                                    overwrite_a=overwrite_a,
                                                    overwrite_b=overwrite_b)

        elif driver == 'gelsd':
            if real_data:
                lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork,
                                               iwork, cond, False, False)
            else:  # complex data
                lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
                                                     nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
                                               cond, False, False)
        if info > 0:
            raise LinAlgError("SVD did not converge in Linear Least Squares")
        if info < 0:
            raise ValueError('illegal value in %d-th argument of internal %s'
                             % (-info, lapack_driver))
        resids = np.asarray([], dtype=x.dtype)
        if m > n:
            x1 = x[:n]
            if rank == n:
                resids = np.sum(np.abs(x[n:])**2, axis=0)
            x = x1
        return x, resids, rank, s

    elif driver == 'gelsy':
        lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
        jptv = np.zeros((a1.shape[1], 1), dtype=np.int32)
        v, x, j, rank, info = lapack_func(a1, b1, jptv, cond,
                                          lwork, False, False)
        if info < 0:
            raise ValueError("illegal value in %d-th argument of internal "
                             "gelsy" % -info)
        if m > n:
            x1 = x[:n]
            x = x1
        return x, np.array([], x.dtype), rank, None


lstsq.default_lapack_driver = 'gelsd'

提供了三种求解方式：'gelsd', 'gelsy', 'gelss'，我们发现在例子中并没有提供参数传入想要使用的方式，默认使用gelsd。所以我们可以聚焦到如下这段代码：

elif driver == 'gelsd':
            if real_data:
                lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork,
                                               iwork, cond, False, False)
            else:  # complex data
                lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
                                                     nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
                                               cond, False, False)

给定的是实数，所以调用if中的_compute_lwork和lapack_func方法。_compute_lwork在与basic.py同一目录下的lapack.py中，源码如下：

def _compute_lwork(routine, *args, **kwargs):
    """
    Round floating-point lwork returned by lapack to integer.

    Several LAPACK routines compute optimal values for LWORK, which
    they return in a floating-point variable. However, for large
    values of LWORK, single-precision floating point is not sufficient
    to hold the exact value --- some LAPACK versions (<= 3.5.0 at
    least) truncate the returned integer to single precision and in
    some cases this can be smaller than the required value.

    Examples
    --------
    >>> from scipy.linalg import lapack
    >>> n = 5000
    >>> s_r, s_lw = lapack.get_lapack_funcs(('sysvx', 'sysvx_lwork'))
    >>> lwork = lapack._compute_lwork(s_lw, n)
    >>> lwork
    32000

    """
    wi = routine(*args, **kwargs)
    if len(wi) < 2:
        raise ValueError('')
    info = wi[-1]
    if info != 0:
        raise ValueError("Internal work array size computation failed: "
                         "%d" % (info,))

    lwork = [w.real for w in wi[:-1]]

    dtype = getattr(routine, 'dtype', None)
    if dtype == _np.float32 or dtype == _np.complex64:
        # Single-precision routine -- take next fp value to work
        # around possible truncation in LAPACK code
        lwork = _np.nextafter(lwork, _np.inf, dtype=_np.float32)

    lwork = _np.array(lwork, _np.int64)
    if _np.any(_np.logical_or(lwork < 0, lwork > _np.iinfo(_np.int32).max)):
        raise ValueError("Too large work array required -- computation cannot "
                         "be performed with standard 32-bit LAPACK.")
    lwork = lwork.astype(_np.int32)
    if lwork.size == 1:
        return lwork[0]
    return lwork

lapack_func是由get_lapack_funcs得到，它也在lapack.py中，源码如下：

def get_lapack_funcs(names, arrays=(), dtype=None):
    """
    省略部分注释...
    In LAPACK, the naming convention is that all functions start with a
    type prefix, which depends on the type of the principal
    matrix. These can be one of {'s', 'd', 'c', 'z'} for the numpy
    types {float32, float64, complex64, complex128} respectively, and
    are stored in attribute ``typecode`` of the returned functions.

    """
    return _get_funcs(names, arrays, dtype,
                      "LAPACK", _flapack, _clapack,
                      "flapack", "clapack", _lapack_alias)

实际调用的是_get_funcs函数，从注释中我们看到，LAPACK的命名习惯在所有函数以类型作为前缀，这个类型由初始矩阵决定，这里给出了's'，'d'，'c'，'z'四种类型，我们的数据是float64，正好对应'd'。底层调用的函数名是：

scipy.linalg.lapack.dgelsd

如图：

实际调用的是一个fortran对象。而这个函数就在同目录下的flapack_gen.pyf.src中，它使用fortran语言编写，源码如下：

subroutine <prefix2>gelsd(m,n,minmn,maxmn,nrhs,a,b,s,cond,r,work,lwork,size_iwork,iwork,info)
    ! x,s,rank,info = dgelsd(a,b,lwork,size_iwork,cond=-1.0,overwrite_a=True,overwrite_b=True)
    ! Solve Minimize 2-norm(A * X - B).

    callstatement (*f2py_func)(&m,&n,&nrhs,a,&m,b,&maxmn,s,&cond,&r,work,&lwork,iwork,&info)
    callprotoargument int*,int*,int*,<ctype2>*,int*,<ctype2>*,int*,<ctype2>*,<ctype2>*,int*,<ctype2>*,int*,int*,int*

    integer intent(hide),depend(a):: m = shape(a,0)
    integer intent(hide),depend(a):: n = shape(a,1)
    integer intent(hide),depend(m,n):: minmn = MIN(m,n)
    integer intent(hide),depend(m,n):: maxmn = MAX(m,n)
    <ftype2> dimension(m,n),intent(in,copy) :: a

    integer depend(b),intent(hide):: nrhs = shape(b,1)
    <ftype2> dimension(maxmn,nrhs),check(maxmn==shape(b,0)),depend(maxmn) :: b
    intent(in,out,copy,out=x) b

    <ftype2> intent(in),optional :: cond=-1.0
    integer intent(out,out=rank) :: r
    <ftype2> intent(out),dimension(minmn),depend(minmn) :: s

    integer intent(in),check(lwork>=1) :: lwork
    ! Impossible to calculate lwork explicitly, need to obtain it from query call first
    ! Same for size_iwork
    <ftype2> dimension(lwork),intent(cache,hide),depend(lwork) :: work

    integer intent(in) :: size_iwork
    integer intent(cache,hide),dimension(MAX(1,size_iwork)),depend(size_iwork) :: iwork
    integer intent(out)::info

end subroutine <prefix2>gelsd

leboop

发布了75 篇原创文章 · 获赞 54 · 访问量 8万+

私信关注

sklearn.linear_model之LinearRegression核心源码解析

scipy.linalg.lapack.dgelsd

猜你喜欢