C#: requesting pages whose content is loaded by AJAX

  When we issue a plain HTTP request, some pages load part of their data via AJAX, so the response we receive is incomplete: wherever the page fills in data locally through AJAX, that data is missing from the requested HTML. What can we do in this case?

  The combination of WebDriver + PhantomJS accomplishes this task. Briefly: WebDriver is a front-end automated-testing framework, and PhantomJS is a headless browser based on WebKit. WebDriver launches phantomjs.exe to do the work. Below is the API provided by WebDriver, which can drive various browsers.

        

  Preparation before use:

       On NuGet, install the two packages Selenium.WebDriver and Selenium.PhantomJS.WebDriver, reference WebDriver.dll in the project, and make sure phantomjs.exe is present in the output directory.

  Let's look at a complete example:

  

using OpenQA.Selenium;
using OpenQA.Selenium.PhantomJS;
using OpenQA.Selenium.Support.UI;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace ConsoleApplication1
{
    /// <summary>
    /// Contract for a crawler that loads pages (including AJAX-rendered content)
    /// in a headless browser and reports progress through events.
    /// </summary>
    public interface ICrawler
    {
        /// <summary>Raised when crawling of a URI begins.</summary>
        event EventHandler<OnStartEventArgs> OnStart;

        /// <summary>Raised when the page source has been captured.</summary>
        event EventHandler<OnCompletedEvent> OnCompleted;

        /// <summary>Raised when an exception occurs during the crawl.</summary>
        event EventHandler<OnErrorEventArgs> OnError;

        /// <summary>Starts crawling <paramref name="uri"/> asynchronously.</summary>
        /// <param name="uri">The page to load.</param>
        /// <param name="script">Optional JavaScript executed after navigation; may be null.</param>
        /// <param name="operation">Custom driver action, wait condition and timeout.</param>
        // BUGFIX: parameter was misspelled "opreation"; the implementation (HighCrawler.Start)
        // already spells it "operation".
        Task Start(Uri uri, Script script, Operation operation);
    }

    /// <summary>
    /// Describes the custom work to perform against the driver once a page has loaded:
    /// an optional action, an optional wait condition, and the wait timeout.
    /// </summary>
    public class Operation
    {
        /// <summary>Optional callback invoked with the live driver after navigation (e.g. clicks, scrolling).</summary>
        // Converted from a public field to an auto-property for consistency with "timeout" below.
        public Action<PhantomJSDriver> Action { get; set; }

        /// <summary>Optional predicate polled until it returns true (e.g. a target element is displayed).</summary>
        public Func<IWebDriver, bool> Condition { get; set; }

        /// <summary>Maximum time, in milliseconds, to wait for <see cref="Condition"/> to become true.</summary>
        // NOTE(review): lower-case name kept for source compatibility with existing callers;
        // conventional C# would name this "Timeout".
        public int timeout { get; set; }
    }

    /// <summary>
    /// A snippet of JavaScript to run in the page after it has loaded.
    /// </summary>
    public class Script
    {
        /// <summary>The JavaScript source to execute.</summary>
        public string Code { get; set; }

        /// <summary>Arguments made available to the script.</summary>
        public object[] Args { get; set; }
    }

    /// <summary>
    /// Event data published when the crawler begins loading a page.
    /// </summary>
    public class OnStartEventArgs
    {
        /// <summary>The address being crawled.</summary>
        public Uri Uri { get; set; }

        /// <summary>Creates event data for the given address.</summary>
        public OnStartEventArgs(Uri uri)
        {
            Uri = uri;
        }
    }

    /// <summary>
    /// Event data published when the crawler hits an exception.
    /// </summary>
    public class OnErrorEventArgs
    {
        /// <summary>The address that was being crawled when the error occurred.</summary>
        public Uri Uri { get; set; }

        /// <summary>The exception that aborted the crawl.</summary>
        public Exception Exception { get; set; }

        /// <summary>Creates event data for the given address and failure.</summary>
        public OnErrorEventArgs(Uri uri, Exception ex)
        {
            Uri = uri;
            Exception = ex;
        }
    }



    /// <summary>
    /// Event data published once a page (including its AJAX-loaded content) has
    /// finished loading and its source has been captured.
    /// </summary>
    // NOTE(review): conventional C# would name this OnCompletedEventArgs, but the
    // name is part of the ICrawler contract and is kept unchanged.
    public class OnCompletedEvent
    {
        /// <summary>The address that was crawled.</summary>
        public Uri Uri { get; set; }

        /// <summary>Managed thread id on which the crawl ran.</summary>
        public int ThreadId { get; set; }

        /// <summary>The fully rendered page HTML.</summary>
        public string PageSource { get; private set; }

        /// <summary>Elapsed crawl time as reported by the crawler.</summary>
        public long Milliseconds { get; private set; }

        /// <summary>The live driver; still usable for element queries inside the handler.</summary>
        public PhantomJSDriver Driver { get; private set; }

        /// <summary>Creates event data describing a completed crawl.</summary>
        public OnCompletedEvent(Uri uri, int threadId, string pageSource, long milliseconds, PhantomJSDriver driver)
        {
            Uri = uri;
            ThreadId = threadId;
            PageSource = pageSource;
            Milliseconds = milliseconds;
            Driver = driver;
        }
    }

    /// <summary>
    /// Crawler built on WebDriver + PhantomJS: loads a page in a headless browser,
    /// optionally runs a script and a custom action, waits for a condition
    /// (typically "the AJAX-loaded element is visible"), then captures the page source.
    /// </summary>
    public class HighCrawler : ICrawler
    {
        /// <summary>Raised when crawling of a URI begins.</summary>
        public event EventHandler<OnStartEventArgs> OnStart;

        /// <summary>Raised when the page source has been captured.</summary>
        public event EventHandler<OnCompletedEvent> OnCompleted;

        /// <summary>Raised when an exception occurs during the crawl.</summary>
        public event EventHandler<OnErrorEventArgs> OnError;

        // Shared driver configuration, built once per process in the static constructor.
        private static PhantomJSOptions _options;
        private static PhantomJSDriverService _service;

        static HighCrawler()
        {
            var service = PhantomJSDriverService.CreateDefaultService();
            service.DiskCache = true;                 // cache resources on disk between runs
            service.IgnoreSslErrors = true;           // tolerate self-signed / invalid certificates
            service.HideCommandPromptWindow = true;   // do not show the phantomjs.exe console window
            service.LoadImages = false;               // skip images: only the DOM/source is wanted
            service.LocalToRemoteUrlAccess = true;    // allow local pages to access remote URLs

            _service = service;
            _options = new PhantomJSOptions();
        }

        /// <summary>
        /// Starts crawling asynchronously on a worker task.
        /// </summary>
        /// <param name="uri">The page to load.</param>
        /// <param name="script">Optional JavaScript executed after navigation; may be null.</param>
        /// <param name="operation">Optional action / wait condition / timeout; may be null.</param>
        public Task Start(Uri uri, Script script, Operation operation)
        {
            return Task.Factory.StartNew(() =>
            {
                var startHandler = OnStart;   // copy to a local for thread-safe invocation
                if (startHandler != null)
                    startHandler(this, new OnStartEventArgs(uri));

                var driver = new PhantomJSDriver(_service, _options);
                try
                {
                    var started = DateTime.Now;
                    driver.Navigate().GoToUrl(uri.ToString());

                    if (script != null)
                        driver.ExecuteScript(script.Code, script.Args);

                    // BUGFIX: the original dereferenced "operation" without a null check,
                    // even though "script" was null-checked.
                    if (operation != null)
                    {
                        if (operation.Action != null)
                            operation.Action(driver);

                        if (operation.Condition != null)
                        {
                            // Poll until the condition holds or the timeout elapses.
                            var wait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(operation.timeout));
                            wait.Until(operation.Condition);
                        }
                    }

                    var threadId = Thread.CurrentThread.ManagedThreadId;

                    // BUGFIX: TimeSpan.Milliseconds is only the 0-999 ms component of the
                    // elapsed time; TotalMilliseconds is the full duration.
                    var milliseconds = (long)DateTime.Now.Subtract(started).TotalMilliseconds;

                    var pageSource = driver.PageSource;

                    // Handler runs before the finally block, so e.Driver is still usable inside it.
                    var completedHandler = OnCompleted;
                    if (completedHandler != null)
                        completedHandler(this, new OnCompletedEvent(uri, threadId, pageSource, milliseconds, driver));
                }
                catch (Exception ex)
                {
                    var errorHandler = OnError;
                    if (errorHandler != null)
                        errorHandler(this, new OnErrorEventArgs(uri, ex));
                }
                finally
                {
                    // Quit() closes every window and shuts down the phantomjs.exe process;
                    // the original's extra Close() call was redundant.
                    driver.Quit();
                }
            });
        }
    }
}

  The code above wraps the crawler in a reusable class. Here is how to use it:

        /// <summary>
        /// Crawls a page and prints the text of elements that were loaded via AJAX.
        /// </summary>
        /// <param name="url">The page to crawl.</param>
        /// <param name="waitId">Id of the element whose appearance signals the AJAX load is finished, e.g. "search-main".</param>
        /// <param name="xpath">XPath of the elements to print, e.g. "//div[@class=\"article panel article-result\"]//h5[@class=\"title\"]//a".</param>
        /// <param name="timeout">Maximum wait, in milliseconds, for the element to appear.</param>
        private static void TestWaitForReady(string url, string waitId, string xpath, int timeout = 10000)
        {
            var crawler = new HighCrawler();

            crawler.OnStart += (s, e) =>
            {
                Console.WriteLine("The crawler starts to grab the address: " + e.Uri.ToString());
            };

            crawler.OnError += (s, e) =>
            {
                Console.WriteLine(" An error occurred in the crawler: " + e.Uri.ToString() + " , exception information " + e.Exception.ToString());
            };

            crawler.OnCompleted += (s, e) =>
            {
                Console.WriteLine(" Length of source received: " + e.PageSource.Length);

                Thread.Sleep(1000);
                Console.WriteLine(" Crawler finished, time spent: " + e.Milliseconds);
                var items = e.Driver.FindElements(By.XPath(xpath));

                foreach (var item in items)
                {
                    Console.WriteLine(item.Text);
                }
            };

            var operation = new Operation
            {
                // No extra driver interaction is needed before waiting.
                Action = (x) =>
                {
                },
                // The AJAX load is considered complete once the marker element is displayed.
                Condition = (x) =>
                {
                    return x.FindElement(By.Id(waitId)).Displayed;
                },
                timeout = timeout
            };

            // BUGFIX: the original passed the undeclared, misspelled identifier "operition",
            // which does not compile; the local variable is "operation".
            crawler.Start(new Uri(url), null, operation);
        }

  The core principle of capturing AJAX-delivered results: WebDriver treats an element on the page as a marker. Once that element appears, the AJAX request has finished and the page source can be returned; in between there is a waiting (polling) phase.

 

  

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325344293&siteId=291194637