【记录】JAVA调用Python脚本爬取天猫商品详情图和主图

JAVA代码

@RequestMapping("public/test1")
    @ApiImplicitParam(paramType = "form", dataType = "int", name = "url", value = "商品路径", required = true)
    public void test(HttpServletRequest request){
// String url="https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.3315572e5M4g31&id=547252985200&skuId=3477349595056&areaId=442000&user_id=2202630747&cat_id=2&is_b=1&rn=cbe7ac082a8e99e791c01d5aa2ec966e";
        String url = request.getParameter("url");
        String[] args1 = new String[] { "python", "E:\\python\\project\\test\\test2.py", url}; 
        StringBuilder result = new StringBuilder();
        try {
            Process process = Runtime.getRuntime().exec(args1);
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    process.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
            in.close();
            process.waitFor();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        System.out.println(result.toString());
        JSONObject json = JSONObject.parseObject(result.toString());
        System.out.println(json.get("DETAIL"));
    }

python脚本(抄的)

import requests

import re, sys, os
import json
import threading
import pprint


class spider:
    """Scrape the main images and detail images of a Tmall product page.

    The page embeds its product data in a ``Setup({...});`` JavaScript call;
    we extract that JSON blob, read the main-image list from it, then follow
    the description URL it contains to collect the detail-image links.
    """

    def __init__(self, url, name):
        # url:  full Tmall item URL (https://detail.tmall.com/item.htm?...)
        # name: directory prefix (kept for callers; used by download helpers
        #       that were removed as dead code)
        self.url = url
        self.headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
                        "Accept-Encoding": "gzip",
                        "Accept-Language": "zh-CN,zh;q=0.8",
                        "Referer": "http://www.example.com/",
                        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
                        }
        self.name = name

    def openurl(self, url):
        """GET *url* with the browser-like headers.

        Returns the response body as text, or None on a non-2xx status.
        """
        response = requests.get(url, headers=self.headers)
        if response.ok:
            return response.text
        return None

    def matchs(self):
        """Parse the product page.

        Returns a dict ``{"MAIN": [...], "DETAIL": [...]}`` with the main-image
        and detail-image URL lists.  Raises RuntimeError when a page cannot be
        fetched, ValueError when the expected data blob is missing — previously
        these cases surfaced as an opaque TypeError/IndexError because openurl's
        None return was passed straight to re.findall.
        """
        tmall_exp = r"Setup\(([\s\S]+?)\);"  ### regex for the embedded product-data JSON
        detail = r"src=\"(https://img\S+?[jpgifn]+?)\""  ### regex for detail-image URLs
        html = self.openurl(self.url)
        if html is None:
            raise RuntimeError("failed to fetch product page: %s" % self.url)
        matches = re.findall(tmall_exp, html)
        if not matches:
            raise ValueError("no Setup(...) product data found; page layout may have changed")
        data = json.loads(matches[0])
        main_img = data['propertyPics']  ## main images plus per-colour images
        detail_html = self.openurl("http:" + data['api']["httpsDescUrl"])
        if detail_html is None:
            raise RuntimeError("failed to fetch description page")
        detail_image = re.findall(detail, detail_html)
        self.newdata = {"MAIN": main_img['default'], "DETAIL": detail_image}
        return self.newdata


if __name__ == "__main__":
    #url="https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.3315572e5M4g31&id=547252985200&skuId=3477349595056&areaId=442000&user_id=2202630747&cat_id=2&is_b=1&rn=cbe7ac082a8e99e791c01d5aa2ec966e"
    # The item URL is passed as the first CLI argument by the Java caller.
    url = sys.argv[1]
    taobao = spider(url, "下载图片/T")

    data = taobao.matchs()
    # Emit strict JSON: pprint prints Python repr (single quotes), which the
    # Java side's JSONObject.parseObject is not guaranteed to accept.
    print(json.dumps(data, ensure_ascii=False))
    #taobao.download()

其中JAVA调用时有个大坑,就是会报

java java.io.IOException: Cannot run program "python": CreateProcess error=2, 系统找不到指定的文件。

错误

在网上搜了半天,把能用的方法都用了,最后解决了,但不知道哪个方法有效

1、java的run--run configurations...--Environment---new---Name:PATH;Value:python路径(如:E:\Program Files\Python\Python37)---Apply(或者Run)

2、配置python环境变量,发现无效,因为今天上午刚配的,网上说在MyEclipse启动时配置的Path不生效,需重启MyEclipse,重启后发现居然成了。。

另外,本来还想爬取淘宝详情的数据的,因为刚接触python,连正则都不会写,研究了半天无果,就先放弃了,如果有大佬会的话欢迎指点。

淘宝的话:tmall_exp = r"Setup\(([\s\S]+?)\);" ### 匹配商品数据的正则 

这一句改为 tmall_exp = r"g_config\(([\s\S]+?)\);" ### 匹配商品数据的正则

可以打印匹配后的html看看

猜你喜欢

转载自blog.csdn.net/weixin_42612454/article/details/81704520