Part2-Chapter8-预测乐高玩具套装价格

目标是爬取 eBay 上的二手乐高数据,并使用岭回归交叉验证的方式给出回归方程。

from bs4 import BeautifulSoup
import numpy as np
import random

def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Parse one saved listing page and append its sold items to retX / retY.

    The page is walked table by table: the tables carrying the attribute
    ``r="1"``, ``r="2"``, ... are visited in order until no table with the
    next index exists.

    Args:
        retX: list that receives one ``[yr, numPce, newFlag, origPrc]`` row
            per accepted listing (mutated in place).
        retY: list that receives the matching selling price (mutated in place).
        inFile: path of the saved HTML page, read as UTF-8.
        yr: value stored in the first feature column.
        numPce: value stored in the second feature column.
        origPrc: value stored in the fourth feature column; listings whose
            price is not above half of it are discarded.

    Raises:
        OSError: if ``inFile`` cannot be opened.
        IndexError: if a row lacks the expected anchors or cells.
        ValueError: if a price cell cannot be converted to ``float``.
    """
    # Open and read the HTML file.
    with open(inFile, encoding='utf-8') as f:
        html = f.read()
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    i = 1
    while True:
        # One lookup per row index; an empty result marks the end of the page.
        currentRow = soup.find_all('table', r="%d" % i)
        if not currentRow:
            break
        row = currentRow[0]

        # A listing counts as new when its title mentions 'new' or 'nisb'.
        lwrTitle = row.find_all('a')[1].text.lower()
        newFlag = 1.0 if ('new' in lwrTitle or 'nisb' in lwrTitle) else 0.0

        # Only sold listings are collected: the fourth cell holds a <span>
        # when the item has been sold.
        cells = row.find_all('td')
        soldUnicde = cells[3].find_all('span')
        if not soldUnicde:
            print("商品 #%d 没有出售" % i)
        else:
            # The fifth cell holds the price; strip currency formatting and
            # any shipping note before converting.
            priceStr = cells[4].text
            priceStr = priceStr.replace('$', '')
            priceStr = priceStr.replace(',', '')
            priceStr = priceStr.replace('Free shipping', '')
            sellingPrice = float(priceStr)
            # Drop prices at or below half the original price (treated as
            # incomplete sets).
            if sellingPrice > origPrc * 0.5:
                print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1
     
 #分别抓取各网页数据
def setDataCollect(retX, retY):
    """Scrape every saved Lego listing page into retX / retY.

    Each entry below is forwarded to ``scrapePage`` as
    ``(inFile, yr, numPce, origPrc)``; pages are processed in the order listed.
    """
    legoPages = (
        ('lego8288.html', 2006, 800, 49.99),
        ('lego10030.html', 2002, 3096, 269.99),
        ('lego10179.html', 2007, 5195, 499.99),
        ('lego10181.html', 2007, 3428, 199.99),
        ('lego10189.html', 2008, 5922, 299.99),
        ('lego10196.html', 2009, 3263, 249.99),
    )
    for pageFile, year, pieceCount, listPrice in legoPages:
        scrapePage(retX, retY, pageFile, year, pieceCount, listPrice)
	
#标准化
def regularize(xMat, yMat):
    """Standardise the features column-wise and centre the targets.

    Each column of ``xMat`` has its mean subtracted and is then divided by
    its variance (not the standard deviation), which is the same scaling
    ``ridgeTest`` applies. ``yMat`` only has its mean subtracted. Neither
    input is modified.

    Args:
        xMat: 2-D feature data, one sample per row.
        yMat: target values.

    Returns:
        tuple: ``(inxMat, inyMat)``, the scaled features and centred targets.
        The original computed both but returned nothing.
    """
    yMean = np.mean(yMat, 0)
    inyMat = yMat - yMean
    inMeans = np.mean(xMat, 0)
    inVar = np.var(xMat, 0)
    # Diagnostic output kept from the original implementation.
    print(inMeans)
    inxMat = (xMat - inMeans) / inVar
    return inxMat, inyMat

# 计算平方误差
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between targets and predictions.

    Both arguments must support element-wise subtraction (NumPy arrays);
    the usual broadcasting rules apply.
    """
    residuals = yArr - yHatArr
    squared = residuals ** 2
    return squared.sum()


#计算回归系数W
def standRegres(xArr, yArr):
    """Solve ordinary least squares via the normal equations.

    Args:
        xArr: 2-D feature data, one sample per row.
        yArr: 1-D sequence of target values, one per sample.

    Returns:
        numpy.matrix of shape ``(n_features, 1)`` holding the weights, or
        ``None`` (after printing a message) when ``X^T X`` has a zero
        determinant and cannot be inverted.
    """
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:
        print("无法求逆")
        return None
    # Solve the linear system directly instead of forming the inverse.
    ws = np.asmatrix(np.linalg.solve(xTx, xMat.T * yMat))
    return ws

#交叉验证岭回归
def crossValidation(xArr, yArr, numVal=10):
    """Pick a ridge penalty by repeated random 90/10 splits and print the model.

    For each of ``numVal`` rounds the samples are shuffled, ``ridgeTest`` is
    fitted on the training part, and the squared error of every candidate
    penalty is measured on the held-out part. The penalty with the lowest
    mean error wins. The final weights are then refitted on ALL samples with
    that penalty, so they match the full-data mean and variance used to map
    them back to the original feature scale. The original instead reused the
    weights of whichever fold happened to run last.

    Args:
        xArr: 2-D feature data, one sample per row. The printed equation
            names exactly four feature columns.
        yArr: 1-D sequence of target values.
        numVal: number of shuffle-and-split rounds (default 10).

    Returns:
        tuple: ``(intercept, unReg)``, the intercept as a float and the
        per-feature weights on the original scale. The equation is also
        printed, as before.

    Raises:
        ValueError: if ``numVal`` is below 1 or fewer than two samples are
            given (no train/test split is possible).
    """
    xAll = np.atleast_2d(np.asarray(xArr, dtype=float))
    yAll = np.asarray(yArr, dtype=float).ravel()
    # Number of samples.
    m = len(yAll)
    if numVal < 1:
        raise ValueError("numVal must be at least 1")
    if m < 2:
        raise ValueError("at least two samples are required")

    # Same split as `j < m * 0.9`, but always keep one sample for testing so
    # small data sets do not end up with an empty test set.
    nTrain = min(int(np.ceil(m * 0.9)), m - 1)

    indexList = list(range(m))
    foldErrors = []
    for _ in range(numVal):
        # Shuffle, then split into training and test parts.
        random.shuffle(indexList)
        trainIdx = indexList[:nTrain]
        testIdx = indexList[nTrain:]
        trainX, trainY = xAll[trainIdx], yAll[trainIdx]
        testX, testY = xAll[testIdx], yAll[testIdx]

        # One row of weights per candidate penalty.
        wMat = ridgeTest(trainX, trainY)

        # Scale the test features with the TRAINING statistics (once per
        # fold), then score every penalty in a single matrix product.
        testXs = (testX - trainX.mean(axis=0)) / trainX.var(axis=0)
        yEst = testXs @ wMat.T + trainY.mean()
        foldErrors.append(((yEst - testY[:, np.newaxis]) ** 2).sum(axis=0))

    # Penalty with the smallest mean error (first one on ties).
    meanErrors = np.mean(foldErrors, axis=0)
    bestIdx = int(np.argmin(meanErrors))

    # Refit on all samples so the weights match the statistics used below.
    bestWeights = ridgeTest(xAll, yAll)[bestIdx]
    meanX = xAll.mean(axis=0)
    varX = xAll.var(axis=0)
    # Undo the standardisation to express the model on the original scale.
    unReg = bestWeights / varX
    intercept = float(-1 * np.sum(meanX * unReg) + yAll.mean())
    print('%f%+f*年份%+f*部件数量%+f*是否全新%+f*原价' % (intercept, unReg[0], unReg[1], unReg[2], unReg[3]))
    return intercept, unReg

#岭回归测试
def _ridgeRegres(xMat, yMat, lam=0.2):
    """Return ridge weights solving ``(X^T X + lam * I) w = X^T y``."""
    xMat = np.atleast_2d(np.asarray(xMat, dtype=float))
    yMat = np.asarray(yMat, dtype=float).reshape(-1, 1)
    denom = xMat.T @ xMat + lam * np.eye(xMat.shape[1])
    # For lam > 0 the system is positive definite, so it can always be solved.
    return np.linalg.solve(denom, xMat.T @ yMat)


def ridgeTest(xArr, yArr):
    """Fit ridge regression for 30 penalties ``exp(i - 10)``, i = 0..29.

    Features are standardised column-wise (mean removed, divided by the
    variance) and the targets are centred before fitting.

    The original called a ``ridgeRegres`` function that is not defined in
    this file; the private helper above supplies the solve so the function
    runs on its own.

    Args:
        xArr: 2-D feature data, one sample per row.
        yArr: 1-D sequence of target values.

    Returns:
        numpy.ndarray of shape ``(30, n_features)``; row ``i`` holds the
        weights for penalty ``exp(i - 10)`` on the standardised scale.
    """
    xMat = np.atleast_2d(np.asarray(xArr, dtype=float))
    yMat = np.asarray(yArr, dtype=float).reshape(-1, 1)
    yMat = yMat - yMat.mean(axis=0)
    xMat = (xMat - xMat.mean(axis=0)) / xMat.var(axis=0)
    numTestPts = 30
    wMat = np.zeros((numTestPts, xMat.shape[1]))
    for i in range(numTestPts):
        wMat[i, :] = _ridgeRegres(xMat, yMat, np.exp(i - 10)).ravel()
    return wMat

if __name__ == "__main__":
    # Feature rows and selling prices are accumulated in these two lists by
    # the scraper, then handed to the cross-validated ridge fit.
    lgX, lgY = [], []
    setDataCollect(lgX, lgY)
    crossValidation(lgX, lgY)

猜你喜欢

转载自blog.csdn.net/JachinMa/article/details/89198011
今日推荐