Using PhantomJS from Java to find text on a web page, highlight it, and take a screenshot

background

Some time ago, the project needed to implement a feature: given the provided data (a website URL and the corresponding typos), highlight the typos on the web page and automatically take a screenshot as evidence.

experience

At first, for various reasons, a combination of cdp4j + Robot was used as a temporary solution: Ctrl+F, followed by Ctrl+C/V, simulated a manual in-page search before the screenshot was taken.
Disadvantages:
1. Chrome needs to be installed on the server;
2. When taking screenshots, the current Chrome window needs to be kept at the top level, otherwise the typos that need to be found will be pasted to other places;
3. Because of the second limitation, multiple threads cannot take screenshots at the same time;

part of the code:

/**
 * Takes a screenshot of a web page after searching it for a given word.
 *
 * Launches a browser through cdp4j, navigates to the target URL, drives an
 * in-page search with simulated key events (Ctrl+F, paste from the system
 * clipboard, Enter), captures a screenshot and writes the PNG bytes to a
 * randomly named file under {@code filePath} (defined outside this method).
 * Any exception is printed and swallowed; the session, browser process and
 * output stream are released in the finally block.
 */
@Override
public void run(){
    Launcher launcher = null;
    Robot robot = null;
    OutputStream out = null;
    Session session = null;
    try{
        launcher = new Launcher();
        SessionFactory factory = launcher.launch();
        session = factory.create();
        session.navigate("需要截图的网站URL");
        session.waitDocumentReady(30000);// wait up to 30 seconds for the page to render

        // Simulate Ctrl+F (page search). The key events go to whichever window has
        // focus, which is why the browser window must stay on top.
        robot = new Robot();
        robot.keyPress(KeyEvent.VK_CONTROL);
        robot.keyPress(KeyEvent.VK_F);
        robot.keyRelease(KeyEvent.VK_F);
        robot.keyRelease(KeyEvent.VK_CONTROL);
        // Put the search term on the system clipboard, then paste it with Ctrl+V.
        Clipboard clipboard = Toolkit.getDefaultToolkit().getSystemClipboard();
        Transferable transferable = new StringSelection("需要查找的错别字");
        clipboard.setContents(transferable, null);
        robot.keyPress(KeyEvent.VK_CONTROL);
        robot.keyPress(KeyEvent.VK_V);
        robot.keyRelease(KeyEvent.VK_V);
        robot.keyRelease(KeyEvent.VK_CONTROL);
        // Press Enter to run the search.
        robot.keyPress(KeyEvent.VK_ENTER);
        robot.keyRelease(KeyEvent.VK_ENTER);

            byte[] png = session.captureScreenshot();
            if(png!=null && png.length>0){
                // Try to create the output directory. The fallback branch re-creates
                // the same File, so the result of mkdirs() is effectively ignored.
                File file = new File(filePath);
                if(!file.mkdirs()){
                    file = new File(filePath);
                }
                // Random file name so that runs do not overwrite each other's output.
                String path = filePath + "/"+UUID.randomUUID() + ".png";
                file = new File(path);
                out = new FileOutputStream(file);
                out.write(png);
                out.flush();
                // your own business logic
                .....
            }
    } catch (Exception e) {
        e.printStackTrace();
    }finally {
    	if(session!=null){
            session.close();
        }
        // close the current window
        //if(robot!=null){
        //    robot.keyPress(KeyEvent.VK_CONTROL);
        //    robot.keyPress(KeyEvent.VK_W);
        //    robot.keyRelease(KeyEvent.VK_W);
        //    robot.keyRelease(KeyEvent.VK_CONTROL);
        //}
        // kill the browser process
        if(launcher!=null){
            launcher.getProcessManager().kill();
        }
        // close the output stream
        if(out!=null){
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

current method

Use PhantomJS to take the screenshots. Two things are needed: 1. phantomjs.exe; 2. a JS script (screenshot.js here).
Advantages:
1. PhantomJS is a headless browser. No browser window pops up during the screenshot process, and other activity on the server does not interfere with it;
Download PhantomJS from the official website and unzip the exe file to the specified directory.
2. Multiple threads can take screenshots at the same time;
3. It can capture long (full-page) screenshots;

Directory Structure
java code:

    /**
     * Runs the PhantomJS screenshot script for one URL / word pair.
     *
     * Builds the paths to phantomjs.exe and screenshot.js relative to the
     * class-path root, starts PhantomJS with the arguments
     * {@code <url> <output image path> <word>}, drains its output until the
     * process closes the stream, and then checks whether the screenshot file
     * was created. {@code url} and {@code word} are defined outside this method.
     * Any exception is printed and swallowed; the reader and the process are
     * released in the finally block.
     */
    @Override
    public void run() {
        String projectPath = Thread.currentThread().getContextClassLoader().getResource("").getPath();
        // NOTE(review): drops the first character and the last 16 characters of the
        // class-path root; presumably strips a leading "/" and a trailing
        // "WEB-INF/classes/" on Windows -- confirm against the deployment layout.
        String needPath = projectPath.substring(1, projectPath.length() - 16);
        String path = needPath + "phantomjs/";
        String file_name = UUID.randomUUID() + ".jpg";
        String new_file = AppConfig.WORDSSCREENSHOT_DIR + "/" + file_name;// where the screenshot is saved

        String phantomjsExePath = path + "phantomjs.exe";// location of phantomjs inside the project
        String codejsPath = path + "screenshot.js";// location of the js script
        Process process = null;
        BufferedReader br = null;
        try {
            // Pass every argument as its own list element instead of one
            // space-joined command string: Runtime.exec(String) splits on
            // whitespace, so a URL, path or word containing a space would be
            // broken into several arguments.
            ProcessBuilder builder = new ProcessBuilder(
                    phantomjsExePath, codejsPath, url.trim(), new_file, word);
            // Merge stderr into stdout so the child cannot block on an
            // undrained error stream.
            builder.redirectErrorStream(true);
            process = builder.start();

            // Drain the output; readLine() returns null once PhantomJS exits.
            br = new BufferedReader(new InputStreamReader(process.getInputStream()));
            StringBuffer sbf = new StringBuffer();
            String tmp = "";
            while ((tmp = br.readLine()) != null) {
                sbf.append(tmp);
            }

            System.out.println("url:" + url + " -->截图结束");
            File screenshot = new File(new_file);
            if (screenshot.exists()) {// screenshot succeeded
                // your own business logic
                ...
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the reader on every path, not only after a successful read.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (process != null) {
                process.destroy();
            }
        }
    }

js code:

/**
 * Created by RYK on 2018/5/24.
 *
 * PhantomJS script: opens a page, wraps every occurrence of a word in a
 * yellow-background <span>, and renders the page to an image file.
 *
 * Usage: phantomjs screenshot.js <url> <outputImagePath> <word>
 * Exits without rendering when fewer than three arguments are given or the
 * page fails to load.
 */
var page = require('webpage').create();
// Declared with `var` so it is not an implicit global.
var system = require('system');

page.viewportSize = {
    width : 1024,
    height : 800
};
page.settings = {
    javascriptEnabled : true,
    loadImages : true,
    userAgent : 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/20.0',
    resourceTimeout:60*1000
};
var address,filePath,value;
if(system.args.length < 4){
    phantom.exit();
}else{
    address = system.args[1];// website URL
    filePath = system.args[2];// path the image is saved to
    value = system.args[3];// the word (typo) to search for
    page.open(address, function (status){
        if (status !== "success"){
            console.log('FAIL to load the address');
            phantom.exit();
            // phantom.exit() does not stop the current callback, so return
            // explicitly instead of evaluating and rendering a failed page.
            return;
        }
        page.evaluate(function(s){
            // Scroll down by up to 10000px (presumably to trigger lazy-loaded
            // content -- confirm).
            window.scrollTo(0,10000);
            window.setTimeout(function(){// wrap every match on the page in a highlighted span
                var body = document.body;
                var contents = body.innerHTML;
                // Escape regex metacharacters so the word is matched literally.
                var escaped = s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
                var reg = new RegExp(escaped, 'g');
                // `$&` re-inserts the matched text, so `$` sequences inside the
                // word are not interpreted as replacement patterns.
                // NOTE(review): matching runs on the serialized HTML, so a word
                // that also occurs inside a tag or attribute is wrapped there too.
                contents = contents.replace(reg, '<span style="background:yellow;">$&</span>');
                document.body.innerHTML = contents;
            },1000);
        },value);
        // Render after 5s, leaving time for the 1s in-page highlight to run.
        window.setTimeout(function (){
            page.render(filePath);// save the screenshot
            phantom.exit();
        }, 5000);
    });
}

insufficient

During testing, taking a screenshot of a single website took a long time; the reason is unknown. In the production environment this can be compensated for by running multiple threads.

illustrate

This is my first article; corrections are welcome.
Thanks to my front-end colleagues for their guidance on the highlighting.

Guess you like

Origin blog.csdn.net/AD_Marcelo/article/details/88867384