Using the PhantomJS headless browser to generate PDFs and webpage snapshots

Using the PhantomJS headless browser to generate PDFs and webpage snapshots

What is PhantomJS?

PhantomJS is a no-interface, scriptable WebKit browser engine. It natively supports multiple web standards: DOM manipulation, CSS selectors, JSON, Canvas and SVG.

What can phantomjs do?

Website testing without UI interface
Screen capture (screenshots)
Automation of page actions
Network monitoring

Scenario 1: Generate webpage snapshots

The officially recommended way to install PhantomJS (version 2.0) is the phantomjs-prebuilt npm package:
npm install phantomjs-prebuilt --save

// Express route: render a web page to a PNG snapshot with PhantomJS and send the image back.
router.get('/', function (req, res, next) {
    var path = require('path');
    var childProcess = require('child_process');
    var phantomjs = require('phantomjs-prebuilt');

    // Use one absolute path for both writing and reading the snapshot. A relative
    // output path would be resolved by the child process against the current
    // working directory, which is not necessarily __dirname, so sendFile() could
    // end up looking for a file that was written somewhere else.
    // NOTE: the file name is fixed, so concurrent requests overwrite each other.
    var outputFile = path.join(__dirname, 'file.png');

    var childArgs = [
        path.join(__dirname, '../lib/rasterize.js'), // official PhantomJS sample script; renders images or PDFs
        'http://www.baidu.com', // address of the page to capture
        outputFile
    ];

    childProcess.execFile(phantomjs.path, childArgs, function (err, stdout, stderr) {
        // Anything on stderr is treated as a failure, same as a non-zero exit.
        if (err || stderr) {
            // res.send(status, body) is the deprecated Express signature;
            // set the status explicitly and send a readable message.
            res.status(500).send(String(err || stderr));
            return;
        }

        res.sendFile(outputFile);
    });
});
// Express route: render a web page to an A4 PDF with PhantomJS and send the file back.
// NOTE(review): this example is registered on the same '/' path as the PNG snapshot
// example; give it its own path if both handlers are mounted on one router.
router.get('/', function (req, res, next) {
    var path = require('path');
    var childProcess = require('child_process');
    var phantomjs = require('phantomjs-prebuilt');

    // Use one absolute path for both writing and reading the PDF. A relative
    // output path would be resolved by the child process against the current
    // working directory, which is not necessarily __dirname, so sendFile() could
    // end up looking for a file that was written somewhere else.
    // NOTE: the file name is fixed, so concurrent requests overwrite each other.
    var outputFile = path.join(__dirname, 'file.pdf');

    var childArgs = [
        path.join(__dirname, '../lib/rasterize.js'), // rendering script run by PhantomJS
        'http://www.baidu.com', // address of the page to convert
        outputFile,
        'A4' // paper format passed through to rasterize.js
    ];

    childProcess.execFile(phantomjs.path, childArgs, function (err, stdout, stderr) {
        // Anything on stderr is treated as a failure, same as a non-zero exit.
        if (err || stderr) {
            // res.send(status, body) is the deprecated Express signature;
            // set the status explicitly and send a readable message.
            res.status(500).send(String(err || stderr));
            return;
        }

        res.sendFile(outputFile);
    });
});

rasterize.js: the PhantomJS script that renders a page to an image or a PDF

"use strict";
// rasterize.js: render a web page to an image or PDF file with PhantomJS.
// Command-line arguments (system.args):
//   [1] URL to load
//   [2] output file name (".pdf" selects paper-size handling)
//   [3] optional paper size/format (PDF) or pixel size (image)
//   [4] optional zoom factor
var page = require('webpage').create(),
    system = require('system'),
    address, output, size, pageWidth, pageHeight;


if (system.args.length < 3 || system.args.length > 5) {
    console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
    console.log('  paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
    console.log('  image (png/jpg output) examples: "1920px" entire page, window width 1920px');
    console.log('                                   "800px*600px" window, clipped to 800x600');
    phantom.exit(1);
} else {
    address = system.args[1];
    output = system.args[2];
    // Default viewport; replaced below when a pixel size is supplied.
    page.viewportSize = { width: 600, height: 600 };
    if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
        // PDF output: "W*H" gives an explicit paper size, anything else is a named format.
        size = system.args[3].split('*');
        page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
            : { format: system.args[3], orientation: 'portrait', margin: '0px' };

    } else if (system.args.length > 3 && system.args[3].substr(-2) === "px") {
        size = system.args[3].split('*');
        if (size.length === 2) {
            // "WIDTHpx*HEIGHTpx": fixed window, output clipped to that rectangle.
            pageWidth = parseInt(size[0], 10);
            pageHeight = parseInt(size[1], 10);
            page.viewportSize = { width: pageWidth, height: pageHeight };
            // clipRect is the property that restricts the rendered area; without it
            // the "clipped to 800x600" behaviour promised in the usage text is lost.
            page.clipRect = { top: 0, left: 0, width: pageWidth, height: pageHeight };
        } else {
            // "WIDTHpx": only the window width is given; the whole page is rendered.
            console.log("size:", system.args[3]);
            pageWidth = parseInt(system.args[3], 10);
            pageHeight = parseInt(pageWidth * 3 / 4, 10); // it's as good an assumption as any
            console.log("pageHeight:", pageHeight);
            page.viewportSize = { width: pageWidth, height: pageHeight };
        }
    }
    if (system.args.length > 4) {
        page.zoomFactor = system.args[4];
    }
    page.settings.loadImages = true;
    page.settings.userAgent = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36';

    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('Unable to load the address!');
            phantom.exit(1);
        } else {
            // Wait 200 ms after the load callback before rendering.
            window.setTimeout(function () {
                page.render(output);
                phantom.exit();
            }, 200);
        }
    });
}

Reference material

webpage module

The webpage module is the core module of PhantomJS for webpage operations.

// Load the PhantomJS webpage module, then create a page instance from it.
var webPage = require('webpage');
var page = webPage.create();

The above code means to load the webpage module of PhantomJS and create an instance.

The following is an introduction to the properties and methods of the webpage instance.

open()

The open method is used to open a specific web page.

// Open a page and print the load status reported to the callback.
var page = require('webpage').create();

page.open('http://slashdot.org', function (s) {
  // s is the status string passed by open(): 'success' or 'fail'.
  console.log(s);
  phantom.exit();
});

In the above code, the open() method opens a specific web page. It accepts two parameters: the first is the URL of the page (here the well-known news site Slashdot), and the second is a callback function that runs once the page has been opened. The callback's parameter is a string describing the status: "success" if the page opened, "fail" otherwise.

Note that PhantomJS will report that the page opened successfully as long as it receives the result returned by the server, regardless of whether the server returns a 404 or 500 error.

The open method uses the GET method by default to communicate with the server, but other methods can also be used.

// Open a page with an explicit HTTP method and request body.
var webPage = require('webpage');
var page = webPage.create();
// Form-encoded body sent with the POST request.
var postBody = 'user=username&password=password';

// Arguments: URL, HTTP method, request data, completion callback.
page.open('http://www.google.com/', 'POST', postBody, function(status) {
  console.log('Status: ' + status);
  // Do other things here...
});

In the above code, the POST method is used to send data to the server. The second parameter of the open method is used to specify the HTTP method, and the third parameter is used to specify the data to be used by the method.

The open method also allows providing a configuration object for more detailed configuration of the HTTP request.

// Open a page using a settings object that describes the HTTP request.
var webPage = require('webpage');
var page = webPage.create();
// Request configuration: method, encoding, headers and a JSON-serialized body.
var settings = {
  operation: "POST",
  encoding: "utf8",
  headers: {
    "Content-Type": "application/json"
  },
  data: JSON.stringify({
    some: "data",
    another: ["custom", "data"]
  })
};

// The settings object takes the place of the separate method/data arguments.
page.open('http://your.custom.api', settings, function(status) {
  console.log('Status: ' + status);
  // Do other things here...
});

evaluate()

The evaluate method is used to execute JavaScript code in the page after opening the page.

// Read the document title by running a function inside the opened page.
var page = require('webpage').create();

// NOTE: `url` is not defined in this snippet; it is assumed to hold the address to open.
page.open(url, function(status) {
  // The function passed to evaluate() runs in the page; its return value comes back here.
  var title = page.evaluate(function() {
    return document.title;
  });
  console.log('Page title is ' + title);
  phantom.exit();
});

The console statement inside the webpage and the console statement inside the evaluate method are not displayed on the command line by default. At this time, the onConsoleMessage callback function can be used, and the above example can be rewritten as follows.

// Forward console output produced inside the page to the command line.
var page = require('webpage').create();

// Called with each message logged by the page's own console.
page.onConsoleMessage = function(msg) {
  console.log('Page title is ' + msg);
};

// NOTE: `url` is not defined in this snippet; it is assumed to hold the address to open.
page.open(url, function(status) {
  page.evaluate(function() {
    // Logged inside the page, so it only becomes visible through onConsoleMessage.
    console.log(document.title);
  });
  phantom.exit();
});

In the above code, there is a console statement inside the evaluate method, which will not be output on the command line by default. At this time, you can use the onConsoleMessage method to listen to this event and process it.

includeJs()

The includeJs method is used to load external scripts on the page, and calls the specified callback function after loading.

// Inject an external script (jQuery) into the page, then use it to click every button.
var page = require('webpage').create();
page.open('http://www.sample.com', function() {
  page.includeJs("http://path/to/jquery.min.js", function() {
    // This callback runs once the script has loaded, so $ is available inside evaluate().
    page.evaluate(function() {
      $("button").click();
    });
    // Exit inside the includeJs callback; exiting earlier would end the run before the script loads.
    phantom.exit()
  });
});

The above example injects the jQuery script into the page and clicks all the buttons. Note that because the script is loaded asynchronously, the phantom.exit() statement should be placed inside the callback function of page.includeJs(); otherwise PhantomJS will exit prematurely.

render()

The render method is used to save the web page as an image, and the parameter is the specified file name. This method saves web pages in different formats according to the suffix name, currently supports PNG, GIF, JPEG and PDF.

// Save a page as a JPEG image using an explicit viewport size.
var webPage = require('webpage');
var page = webPage.create();

page.viewportSize = { width: 1920, height: 1080 };
page.open("http://www.google.com", function start(status) {
  // Second argument: format selects the image type, quality ranges from 0 to 100.
  page.render('google_home.jpeg', {format: 'jpeg', quality: '100'});
  phantom.exit();
});

This method can also accept a configuration object. The format field is used to specify the image format, and the quality field is used to specify the image quality. The minimum value is 0 and the maximum value is 100.

viewportSize,zoomFactor

The viewportSize property specifies the size of the browser's viewport, which is the initial browser window size when the web page is loaded.

// Set the size of the browser viewport used when the page is loaded.
var webPage = require('webpage');
var page = webPage.create();

// Both width and height are given; height must not be omitted.
page.viewportSize = {
  width: 480,
  height: 800
};

The height field of viewportSize must be specified and cannot be omitted.

The zoomFactor property is used to specify the zoom factor of the page during rendering (render method and renderBase64 method). The default is 1 (ie 100%).

// Render the page scaled down to 25% (a zoomFactor of 1 means 100%).
var webPage = require('webpage');
var page = webPage.create();

page.zoomFactor = 0.25;
page.render('capture.png');

onResourceRequested

The onResourceRequested attribute is used to specify a callback function that will be triggered when the page requests a resource. Its first parameter is the metadata object of the HTTP request, and the second parameter is the issued network request object.

The HTTP request includes the following fields.

Field — Description
id — The number of the requested resource
method — The HTTP method used
url — The URL of the requested resource
time — A Date object containing the time of the request
headers — Array of HTTP headers

The network request object contains the following methods

Method — Description
abort() — Terminates the current network request, which causes the onResourceError callback function to be called.
changeUrl(newUrl) — Changes the URL of the current network request.
setHeader(key, value) — Sets an HTTP header.
// Log every resource request issued by the page.
var webPage = require('webpage');
var page = webPage.create();

// requestData: metadata of the HTTP request; networkRequest: the request object (unused here).
page.onResourceRequested = function(requestData, networkRequest) {
  console.log('Request (#' + requestData.id + '): ' + JSON.stringify(requestData));
};

onResourceReceived

The onResourceReceived attribute is used to specify a callback function, which will be executed when the webpage receives the requested resource. Its parameter is the metadata object of the HTTP response sent by the server, including the following fields.

Field — Description
id — The number of the requested resource
url — The URL of the requested resource
time — A Date object containing the time of the HTTP response
headers — Array of HTTP headers
bodySize — Size of the received content after decompression
contentType — Type of the content received
redirectURL — Redirect URL (if any)
stage — For a multi-chunk HTTP response, the first chunk is "start" and the last chunk is "end".
status — HTTP status code, 200 on success.
statusText — HTTP status text, such as OK.

If the HTTP response is very large and is sent in multiple chunks, onResourceReceived will trigger a callback function when each chunk is received.

// Log every resource response received by the page.
var webPage = require('webpage');
var page = webPage.create();

// response: metadata of the HTTP response; called once per received chunk for chunked responses.
page.onResourceReceived = function(response) {
  console.log('Response (#' + response.id + ', stage "' + response.stage + '"): ' + JSON.stringify(response));
};

system module

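The system module gives access to operating-system-level information. system.args is the array of command-line arguments: the first element is the script name, and the arguments passed to the script follow it.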

// page.js: measure how long a page takes to load.
// Usage: phantomjs page.js <some URL>
var page = require('webpage').create(),
    system = require('system'),
    t, address;

// No URL was given on the command line.
if (system.args.length === 1) {
    console.log('Usage: page.js <some URL>');
    phantom.exit();
} else {
    // Keep the load logic in the else branch: phantom.exit() does not stop the
    // script at the call site, so without it page.open() would still be invoked
    // with an undefined address.
    t = Date.now();
    address = system.args[1];
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
        } else {
            // Elapsed time between starting the request and the load callback.
            t = Date.now() - t;
            console.log('Loading time ' + t + ' ms');
        }
        phantom.exit();
    });
}

The method of use is as follows:

$ phantomjs page.js http://www.google.com
{{o.name}}
{{m.name}}

Guess you like

Origin http://10.200.1.11:23101/article/api/json?id=324128066&siteId=291194637