JD (Jingdong) product image crawler + Keras image classification

JD Product Crawler

Crawl product images from https://list.jd.com/list.html?cat=670%2C671%2C673&page=1&s=57&click=0, gathering roughly 10,000 images each for mobile phones, tablets, notebooks, and desktops.

# Crawler code
import re
import requests
from multiprocessing.pool import Pool
from lxml import etree
import time

# Crawl the product image URLs on one listing page
def crawl(url):
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
    }
    text = requests.get(url, headers=headers).text
    html = etree.HTML(text)
    img_urls = html.xpath("//div[@class='gl-i-wrap']/div[@class='p-img']/a/img/@data-lazy-img")
    img_urls = list(map(lambda url: "http:" + url, img_urls))
    return img_urls

# Download one image to the local disk
def download_img_multipro(img_down_param):
    file_path = "./data/com_img/" + str(img_down_param[0]) + ".jpg"
    with open(file_path, "wb") as f:
        f.write(requests.get(img_down_param[1]).content)
        print(file_path + " downloaded")
# Main program
if __name__ == '__main__':
    n = 0
    # Loop over the 300+ listing pages
    for i in range(1, 316):
        # Build the URL for this page
        url = "https://list.jd.com/list.html?cat=670%2C671%2C673&page={}&s=57&click=0".format(i)
        # Get the list of image URLs on this page
        img_urls = crawl(url)
        # Sequential numbers used to name the image files
        img_count = len(img_urls) + n
        img_name = [j for j in range(n, img_count)]
        n = img_count
        # Build the download arguments: a list of (file number, image url) tuples
        img_down_param = zip(img_name, img_urls)
        # Create a process pool
        pool = Pool(processes=5)
        # Download with multiple processes
        pool.map(download_img_multipro, img_down_param)
        # Close the pool and block the main process until the workers finish
        pool.close()
        pool.join()
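
To see what the XPath selector in crawl() matches, here is a minimal sketch run against a hypothetical HTML fragment (the fragment is illustrative, not real JD markup; the selector and the data-lazy-img attribute are taken from the code above):

from lxml import etree

# Hypothetical fragment mimicking one product cell on the listing page
sample = """
<div class="gl-i-wrap">
  <div class="p-img">
    <a><img data-lazy-img="//img10.360buyimg.com/n7/example.jpg"/></a>
  </div>
</div>
"""
html = etree.HTML(sample)
print(html.xpath("//div[@class='gl-i-wrap']/div[@class='p-img']/a/img/@data-lazy-img"))
# ['//img10.360buyimg.com/n7/example.jpg'] -- crawl() then prefixes each URL with "http:"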

Crawl results

The crawled images of the four categories are stored in four folders under the same relative path, one each for desktops, notebooks, tablets, and mobile phones.

Building the dataset

Use OpenCV to read the pixel values of a single image and store them in a vector (note that OpenCV actually reads pixels in BGR order: blue, green, red).
All product images are 220 * 220 pixels with three channels, so storing a single image in one vector requires 220 * 220 * 3 = 145,200 dimensions. That vector is clearly too large, so we use OpenCV to shrink each image to 100 * 100, which brings the vector down to 100 * 100 * 3 = 30,000 dimensions. The implementation code is as follows:

import cv2

def compress_dataset():
    """
    Compress every crawled image to 100 * 100 pixels.
    """
    # (source folder, number of crawled images) for each category
    categories = [
        ("phone_img", 9549),
        ("pad_img", 9300),
        ("notebook_com_img", 9360),
        ("computer_img", 9025),
    ]
    for folder, count in categories:
        global_num = 0
        for i in range(count):
            # Build the source file path
            file_path = "./dataset/" + folder + "/" + str(i) + ".jpg"
            # Skip invalid images and keep going on errors
            try:
                # Read the original image (None if the file is unreadable)
                np_3_img = cv2.imread(file_path)
                if np_3_img is None:
                    continue
                # Shrink the image to 100 * 100
                out = cv2.resize(np_3_img, dsize=(100, 100), interpolation=cv2.INTER_AREA)
                # Write the compressed image back to disk
                cv2.imwrite("./dataset/compress_dataset/" + folder + "/" + str(global_num) + ".jpg", out)
                global_num += 1
                print("compressed and saved", global_num)
            except Exception:
                print("error while processing", file_path)

All images are now compressed to 100 * 100 pixels and saved locally.

Image classification with a convolutional neural network

# Build the network
from keras import layers
from keras import models

model = models.Sequential()
# Conv layer 1: the input tensor is the image pixel matrix
model.add(layers.Conv2D(32, (3, 3), activation="relu", input_shape=(110, 110, 3)))
# Pooling layer 1: max pooling
model.add(layers.MaxPooling2D((2, 2)))
# Conv layer 2
model.add(layers.Conv2D(64, (3, 3), activation="relu"))
# Pooling layer 2
model.add(layers.MaxPooling2D((2, 2)))
# Conv layer 3
model.add(layers.Conv2D(128, (3, 3), activation="relu"))
# Pooling layer 3
model.add(layers.MaxPooling2D((2, 2)))
# Conv layer 4
model.add(layers.Conv2D(128, (3, 3), activation="relu"))
# Pooling layer 4
model.add(layers.MaxPooling2D((2, 2)))
# Flatten the output tensor
model.add(layers.Flatten())
# Randomly drop half of this layer's activations to reduce overfitting
model.add(layers.Dropout(0.5))
# Fully connected layer
model.add(layers.Dense(512, activation="relu"))
# Output layer: softmax activation outputs class probabilities
model.add(layers.Dense(4, activation="softmax"))
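
To check the layer output shapes, you can print a summary. With a 110 * 110 * 3 input (the generators below resize the 100 * 100 images to 110 * 110), each Conv2D removes a 2-pixel border and each max pooling halves the spatial size, so the feature map shrinks 108 -> 54 -> 52 -> 26 -> 24 -> 12 -> 10 -> 5:

model.summary()  # the final feature map is (5, 5, 128), i.e. 3200 values after Flatten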

Compile the network

# Compile the model
from keras import optimizers
# Categorical cross-entropy loss, with accuracy as the metric
model.compile(loss="categorical_crossentropy",
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=["accuracy"]
              )

Data preprocessing

Use the Keras ImageDataGenerator to read the image data from local paths and pass the resulting generators to the network for training.

# Data preprocessing
from keras.preprocessing.image import ImageDataGenerator

# Paths to the data
train_img_dir = "E:/code/python deeplearning/dataset/compress_dataset/train"
validation_img_dir = "E:/code/python deeplearning/dataset/compress_dataset/validation"

# Data generators: rescale pixel values to [0, 1] and apply data augmentation to the training data
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)
test_datagen = ImageDataGenerator(rescale=1./255)

# Training data generator
train_generator = train_datagen.flow_from_directory(
    train_img_dir,
    target_size=(110,110),
    batch_size=128,
    class_mode="categorical"
)

# Validation data generator
validation_generator = test_datagen.flow_from_directory(
    validation_img_dir,
    target_size=(110,110),
    batch_size=128,
    class_mode="categorical"
)
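
flow_from_directory assigns label indices from the subfolder names in alphabetical order; as a quick sketch you can inspect the mapping (the folder names shown are an assumption based on the dataset layout above):

print(train_generator.class_indices)
# e.g. {'computer_img': 0, 'notebook_com_img': 1, 'pad_img': 2, 'phone_img': 3}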

There are more than 30,000 training images and more than 5,000 validation images.

Model training

Pass the data generators to the model for training: the batch size is 128, each epoch runs 250 training batches and 40 validation batches (250 * 128 = 32,000 training samples per epoch; 40 * 128 = 5,120 validation samples), for 5 epochs in total.

history = model.fit_generator(
    train_generator,
    steps_per_epoch=250,
    epochs=5,
    validation_data=validation_generator,
    validation_steps=40
)
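
A minimal sketch for visualizing the training curves from the returned history object (the accuracy key is "acc" in older Keras versions and "accuracy" in newer ones, so it is looked up defensively):

import matplotlib.pyplot as plt

acc_key = "acc" if "acc" in history.history else "accuracy"
plt.plot(history.history[acc_key], label="train acc")
plt.plot(history.history["val_" + acc_key], label="val acc")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.show()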

The results show that the training accuracy is nearly 75% and the validation accuracy is nearly 90%, which is a good result. The validation accuracy being higher than the training accuracy is expected here: the model uses strong anti-overfitting measures (dropout randomly disables half of the dense layer's activations, and the training data is heavily augmented), and these measures are active only during training, so the accuracy measured on the training batches is pushed down while validation runs on the full, unmodified network. The model also overfits slowly as a result, so it ends up performing better on the validation set, which matches the intent of the training setup.

Testing the final model

# Generate the test data
test_datagen = ImageDataGenerator(rescale=1./255)
test_img_dir = "E:/code/python deeplearning/dataset/compress_dataset/test"

test_generator = test_datagen.flow_from_directory(
    test_img_dir,
    target_size=(110,110),
    batch_size=128,
    class_mode="categorical"
)

Pass in the test data and evaluate:

res = model.evaluate_generator(test_generator, steps=40)
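
evaluate_generator returns the loss followed by the compiled metrics, so res here unpacks as [loss, accuracy]; a quick sketch for printing it:

print("test loss: {:.4f}, test accuracy: {:.4f}".format(res[0], res[1]))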

The results show that the test accuracy reached nearly 90%.

Origin: blog.csdn.net/cyj5201314/article/details/113350144