sklearn.linear_model la fuente núcleo código de análisis LinearRegression

A partir del código LinearRegression primero en uso es el siguiente:

from sklearn import linear_model as lm
import numpy as np
import os
import pandas as pd


def read_data(path):
    """
    使用pandas读取数据
    """

    return pd.read_csv(path)


def train_model(train_data, features, labels):
    """
    根据训练数据集训练模型,并返回训练好的模型
    :param train_data: 
    :param features: 
    :param labels:  
    :return:
    """

    model = lm.LinearRegression()
    model.fit(train_data[features], train_data[labels])
    print(model.intercept_)
    print(model.coef_)
    
    return model


def linear_model(data, data_number):
    """

    :param data:
    :return:
    """
    # 特征的名称,和数据文件中第一行标题行对应
    features = ["x"]
    # 标签名称,和数据文件中第一行标题行对应
    labels = ["y"]
    # 将数据分为训练数据集和测试数据集,以data_number为分割线,下标0~data_number的为训练集
    train_data = data[:data_number]
    test_data = data[data_number:]
    # 训练模型
    model = train_model(train_data, features, labels)


if __name__ == "__main__":
    home_path = os.path.dirname(os.path.abspath(__file__))
    # Windows下的存储路径与Linux并不相同
    if os.name == "nt":
        dataPath = "%s\\data\\simple_example.csv" % home_path
    else:
        dataPath = "%s/data/simple_example.csv" % home_path
    data = read_data(dataPath)

    linear_model(data, data_number=15)

simple_example.csv datos documento dice lo siguiente:

x,y
10,7.7
10,9.87
11,11.18
12,10.43
13,12.36
14,14.15
15,15.73
16,16.4
17,18.86
18,16.13
19,18.21
20,18.37
21,22.61
22,19.83

La idea básica del programa

(1) usando pandas csv datos de formato leer, leer antes de 15, fijada por el data_number parámetro;

(2) modelo de formación, los datos se divide en dos pasos, el primer paso en la definición de un modelo de regresión lineal es ajuste usando el segundo paso de lectura, el código del núcleo es la siguiente:

Modelo = lm.LinearRegression ()
model.fit (train_data [características], train_data [etiquetas])

los resultados del programa de ajuste son los siguientes:

y=1.01211289x-0.62794705

 

Puede ser visto a través de método de ajuste, el núcleo de esta llamada de método es el siguiente:

linalg.lstsq(X, y)

Es paquete scipy proporciona un método para Ax = b Soluciones de resolución de método de mínimos cuadrados, el método fuente scipy scipy / scipy / linalg / basic.py, todo el código fuente como sigue:

# Linear Least Squares
def lstsq(a, b, cond=None, overwrite_a=False, overwrite_b=False,
          check_finite=True, lapack_driver=None):
    """
    省略了注释...

    """
    a1 = _asarray_validated(a, check_finite=check_finite)
    b1 = _asarray_validated(b, check_finite=check_finite)
    if len(a1.shape) != 2:
        raise ValueError('Input array a should be 2-D')
    m, n = a1.shape
    if len(b1.shape) == 2:
        nrhs = b1.shape[1]
    else:
        nrhs = 1
    if m != b1.shape[0]:
        raise ValueError('Shape mismatch: a and b should have the same number'
                         ' of rows ({} != {}).'.format(m, b1.shape[0]))
    if m == 0 or n == 0:  # Zero-sized problem, confuses LAPACK
        x = np.zeros((n,) + b1.shape[1:], dtype=np.common_type(a1, b1))
        if n == 0:
            residues = np.linalg.norm(b1, axis=0)**2
        else:
            residues = np.empty((0,))
        return x, residues, 0, np.empty((0,))

    driver = lapack_driver
    if driver is None:
        driver = lstsq.default_lapack_driver
    if driver not in ('gelsd', 'gelsy', 'gelss'):
        raise ValueError('LAPACK driver "%s" is not found' % driver)

    lapack_func, lapack_lwork = get_lapack_funcs((driver,
                                                 '%s_lwork' % driver),
                                                 (a1, b1))
    real_data = True if (lapack_func.dtype.kind == 'f') else False

    if m < n:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        if len(b1.shape) == 2:
            b2 = np.zeros((n, nrhs), dtype=lapack_func.dtype)
            b2[:m, :] = b1
        else:
            b2 = np.zeros(n, dtype=lapack_func.dtype)
            b2[:m] = b1
        b1 = b2

    overwrite_a = overwrite_a or _datacopied(a1, a)
    overwrite_b = overwrite_b or _datacopied(b1, b)

    if cond is None:
        cond = np.finfo(lapack_func.dtype).eps

    if driver in ('gelss', 'gelsd'):
        if driver == 'gelss':
            lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
            v, x, s, rank, work, info = lapack_func(a1, b1, cond, lwork,
                                                    overwrite_a=overwrite_a,
                                                    overwrite_b=overwrite_b)

        elif driver == 'gelsd':
            if real_data:
                lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork,
                                               iwork, cond, False, False)
            else:  # complex data
                lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
                                                     nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
                                               cond, False, False)
        if info > 0:
            raise LinAlgError("SVD did not converge in Linear Least Squares")
        if info < 0:
            raise ValueError('illegal value in %d-th argument of internal %s'
                             % (-info, lapack_driver))
        resids = np.asarray([], dtype=x.dtype)
        if m > n:
            x1 = x[:n]
            if rank == n:
                resids = np.sum(np.abs(x[n:])**2, axis=0)
            x = x1
        return x, resids, rank, s

    elif driver == 'gelsy':
        lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
        jptv = np.zeros((a1.shape[1], 1), dtype=np.int32)
        v, x, j, rank, info = lapack_func(a1, b1, jptv, cond,
                                          lwork, False, False)
        if info < 0:
            raise ValueError("illegal value in %d-th argument of internal "
                             "gelsy" % -info)
        if m > n:
            x1 = x[:n]
            x = x1
        return x, np.array([], x.dtype), rank, None


lstsq.default_lapack_driver = 'gelsd'

Proporciona tres formas de resolver: 'gelsd', 'gelsy', 'gelss', encontramos no proporcionó una forma de pasar parámetros que desea utilizar en el ejemplo, el gelsd defecto. Así podemos centrarnos en el siguiente código:

elif driver == 'gelsd':
            if real_data:
                lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork,
                                               iwork, cond, False, False)
            else:  # complex data
                lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
                                                     nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
                                               cond, False, False)

Dado que el número real, así que llame si el _compute_lwork y el método lapack_func. _compute_lwork en lapack.py bajo basic.py el mismo directorio, el código fuente es el siguiente:

def _compute_lwork(routine, *args, **kwargs):
    """
    Round floating-point lwork returned by lapack to integer.

    Several LAPACK routines compute optimal values for LWORK, which
    they return in a floating-point variable. However, for large
    values of LWORK, single-precision floating point is not sufficient
    to hold the exact value --- some LAPACK versions (<= 3.5.0 at
    least) truncate the returned integer to single precision and in
    some cases this can be smaller than the required value.

    Examples
    --------
    >>> from scipy.linalg import lapack
    >>> n = 5000
    >>> s_r, s_lw = lapack.get_lapack_funcs(('sysvx', 'sysvx_lwork'))
    >>> lwork = lapack._compute_lwork(s_lw, n)
    >>> lwork
    32000

    """
    wi = routine(*args, **kwargs)
    if len(wi) < 2:
        raise ValueError('')
    info = wi[-1]
    if info != 0:
        raise ValueError("Internal work array size computation failed: "
                         "%d" % (info,))

    lwork = [w.real for w in wi[:-1]]

    dtype = getattr(routine, 'dtype', None)
    if dtype == _np.float32 or dtype == _np.complex64:
        # Single-precision routine -- take next fp value to work
        # around possible truncation in LAPACK code
        lwork = _np.nextafter(lwork, _np.inf, dtype=_np.float32)

    lwork = _np.array(lwork, _np.int64)
    if _np.any(_np.logical_or(lwork < 0, lwork > _np.iinfo(_np.int32).max)):
        raise ValueError("Too large work array required -- computation cannot "
                         "be performed with standard 32-bit LAPACK.")
    lwork = lwork.astype(_np.int32)
    if lwork.size == 1:
        return lwork[0]
    return lwork

lapack_func se deriva de get_lapack_funcs, también lapack.py, el código fuente es el siguiente:

def get_lapack_funcs(names, arrays=(), dtype=None):
    """
    省略部分注释...
    In LAPACK, the naming convention is that all functions start with a
    type prefix, which depends on the type of the principal
    matrix. These can be one of {'s', 'd', 'c', 'z'} for the numpy
    types {float32, float64, complex64, complex128} respectively, and
    are stored in attribute ``typecode`` of the returned functions.

    """
    return _get_funcs(names, arrays, dtype,
                      "LAPACK", _flapack, _clapack,
                      "flapack", "clapack", _lapack_alias)

La llamada de función real es _get_funcs, vemos de las anotaciones, la convención de nomenclatura LAPACK en todos los tipos de funciones a un prefijo, esto está determinado por el tipo de matriz inicial, aquí el 's', 'd', 'c',' z 'de cuatro tipos, nuestros datos son float64, que corresponde exactamente a' d'. La parte inferior del nombre de la función se llama:

scipy.linalg.lapack.dgelsd

Figura:

La llamada real es un objeto FORTRAN. Y esta función en flapack_gen.pyf.src en el mismo directorio, utilizando un lenguaje Fortran, el código fuente de la siguiente manera:

subroutine <prefix2>gelsd(m,n,minmn,maxmn,nrhs,a,b,s,cond,r,work,lwork,size_iwork,iwork,info)
    ! x,s,rank,info = dgelsd(a,b,lwork,size_iwork,cond=-1.0,overwrite_a=True,overwrite_b=True)
    ! Solve Minimize 2-norm(A * X - B).

    callstatement (*f2py_func)(&m,&n,&nrhs,a,&m,b,&maxmn,s,&cond,&r,work,&lwork,iwork,&info)
    callprotoargument int*,int*,int*,<ctype2>*,int*,<ctype2>*,int*,<ctype2>*,<ctype2>*,int*,<ctype2>*,int*,int*,int*

    integer intent(hide),depend(a):: m = shape(a,0)
    integer intent(hide),depend(a):: n = shape(a,1)
    integer intent(hide),depend(m,n):: minmn = MIN(m,n)
    integer intent(hide),depend(m,n):: maxmn = MAX(m,n)
    <ftype2> dimension(m,n),intent(in,copy) :: a

    integer depend(b),intent(hide):: nrhs = shape(b,1)
    <ftype2> dimension(maxmn,nrhs),check(maxmn==shape(b,0)),depend(maxmn) :: b
    intent(in,out,copy,out=x) b

    <ftype2> intent(in),optional :: cond=-1.0
    integer intent(out,out=rank) :: r
    <ftype2> intent(out),dimension(minmn),depend(minmn) :: s

    integer intent(in),check(lwork>=1) :: lwork
    ! Impossible to calculate lwork explicitly, need to obtain it from query call first
    ! Same for size_iwork
    <ftype2> dimension(lwork),intent(cache,hide),depend(lwork) :: work

    integer intent(in) :: size_iwork
    integer intent(cache,hide),dimension(MAX(1,size_iwork)),depend(size_iwork) :: iwork
    integer intent(out)::info

end subroutine <prefix2>gelsd

 

Publicado 75 artículos originales · ganado elogios 54 · Vistas a 80000 +

Supongo que te gusta

Origin blog.csdn.net/L_15156024189/article/details/104774649
Recomendado
Clasificación