C#: Crawling static pages with HtmlAgilityPack

I have recently become very interested in web crawlers. After studying them a little, I used HtmlAgilityPack to build a very simple crawler; it can only fetch plain static HTML pages.

Introduction to HtmlAgilityPack

HtmlAgilityPack is a fast, open-source HTML parsing library that supports XPath. With HtmlAgilityPack, parsing an HTML document is as easy and convenient as parsing an XML document.

Installing HtmlAgilityPack for C#

  1. If NuGet is available in Visual Studio, search for HtmlAgilityPack in NuGet and install it directly.
  2. Alternatively, download the package and unzip it. It contains three files; you only need to reference HtmlAgilityPack.dll and HtmlAgilityPack.xml in your solution.

Example (downloading the images on a page)

Loading the HTML page

// Load the document from a web page.
const string url = "https://www.bilibili.com";
var web = new HtmlWeb();
HtmlDocument hd = web.Load(url);

Writing an image downloader with WebClient

Requires using System.Net and using System.IO

/// <summary>
/// Downloads images to the local disk.
/// </summary>
public class ImgDownloader
{
    /// <summary>
    /// Downloads one image and stores it as <paramref name="fileName"/> inside
    /// <paramref name="folderPath"/>, creating the folder when it does not exist yet.
    /// A failed download is reported on the console instead of being thrown, so one
    /// broken image does not abort the caller's loop.
    /// </summary>
    /// <param name="webClient">Client used to perform the download.</param>
    /// <param name="url">
    /// Image URL. When it does not start with "http://" or "https://" it is completed
    /// by prefixing "https:" (this turns a protocol-relative "//host/a.png" into a full URL).
    /// </param>
    /// <param name="folderPath">
    /// Target folder, with or without a trailing separator. When null or empty the
    /// file is written relative to the current directory.
    /// </param>
    /// <param name="fileName">Name of the file to create.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="webClient"/>, <paramref name="url"/> or <paramref name="fileName"/> is null.
    /// </exception>
    public static void DownloadImg(WebClient webClient, string url, string folderPath, string fileName)
    {
        if (webClient == null)
        {
            throw new ArgumentNullException("webClient");
        }
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }
        if (fileName == null)
        {
            throw new ArgumentNullException("fileName");
        }

        string targetPath = fileName;
        if (!string.IsNullOrEmpty(folderPath))
        {
            // CreateDirectory does nothing when the folder already exists,
            // so a separate Exists check is unnecessary.
            Directory.CreateDirectory(folderPath);
            // Path.Combine adds the separator only when folderPath lacks one.
            targetPath = Path.Combine(folderPath, fileName);
        }

        // Complete URLs that come without a scheme. The scheme is checked at the START
        // of the URL: a URL that merely contains "http:" somewhere (for example inside
        // its query string) still needs the prefix.
        url = url.Trim();
        bool hasHttpScheme =
            url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasHttpScheme)
        {
            url = "https:" + url;
        }

        try
        {
            webClient.DownloadFile(url, targetPath);
            Console.WriteLine(fileName + "下载成功");
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: report the failure together with the URL
            // that caused it and let the caller continue with the next image.
            Console.Write(ex.Message);
            Console.WriteLine(url);
        }
    }
}

Getting the images in img tags with XPath

string imgPath = "//img";// select every img element
int imgNum = 0;// running image number, used as the file name
Uri pageUri = new Uri(url);// base address for resolving relative src values
// SelectNodes returns null (not an empty collection) when nothing matches,
// so the result must be checked before it is enumerated.
HtmlNodeCollection imgNodes = hd.DocumentNode.SelectNodes(imgPath);
if (imgNodes != null)
{
    // Download the image referenced by each img tag.
    foreach (HtmlNode node in imgNodes)
    {
        HtmlAttribute srcAttr = node.Attributes["src"];
        if (srcAttr == null || string.IsNullOrWhiteSpace(srcAttr.Value))
        {
            continue;
        }

        // Resolve "//host/a.png", "/a.png" and "a.png" against the page address.
        Uri imgUri;
        if (!Uri.TryCreate(pageUri, srcAttr.Value.Trim(), out imgUri))
        {
            continue;
        }

        // Skip data:, javascript: and other non-HTTP references;
        // DownloadImg only understands http/https addresses.
        if (imgUri.Scheme != Uri.UriSchemeHttp && imgUri.Scheme != Uri.UriSchemeHttps)
        {
            continue;
        }

        imgNum++;
        // Build the file name; the extension is taken from the last path segment only,
        // so a query string or a dot in the host name never ends up in the file name.
        // A path without an extension yields a file name without one.
        string imgUrlPath = imgUri.AbsolutePath;
        int dotIndex = imgUrlPath.LastIndexOf('.');
        string extension = dotIndex > imgUrlPath.LastIndexOf('/') ? imgUrlPath.Substring(dotIndex) : string.Empty;
        string fileName = imgNum + extension;
        ImgDownloader.DownloadImg(wc, imgUri.AbsoluteUri, "images/", fileName);
    }
}

Getting background images with XPath

// Download background images declared in inline styles.
string bgImgPath = "//*[@style]";// select every node that has a style attribute
Uri bgPageUri = new Uri(url);// base address for resolving relative URLs
// Captures the content of url(...) after background-image, allowing optional
// whitespace and optional single or double quotes around the URL.
Regex bgImgRegex = new Regex(
    @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")]+)['""]?\s*\)",
    RegexOptions.IgnoreCase);
// SelectNodes returns null (not an empty collection) when nothing matches.
HtmlNodeCollection styledNodes = hd.DocumentNode.SelectNodes(bgImgPath);
if (styledNodes != null)
{
    foreach (HtmlNode node in styledNodes)
    {
        Match bgMatch = bgImgRegex.Match(node.Attributes["style"].Value);
        if (!bgMatch.Success)
        {
            continue;
        }

        // Resolve "//host/a.png", "/a.png" and "a.png" against the page address.
        Uri bgImgUri;
        if (!Uri.TryCreate(bgPageUri, bgMatch.Groups["url"].Value.Trim(), out bgImgUri))
        {
            continue;
        }

        // Skip data: and other non-HTTP references;
        // DownloadImg only understands http/https addresses.
        if (bgImgUri.Scheme != Uri.UriSchemeHttp && bgImgUri.Scheme != Uri.UriSchemeHttps)
        {
            continue;
        }

        // Count the image only once it is known to be downloadable.
        imgNum++;
        // Build the file name; the extension is taken from the last path segment only.
        string bgUrlPath = bgImgUri.AbsolutePath;
        int bgDotIndex = bgUrlPath.LastIndexOf('.');
        string bgExtension = bgDotIndex > bgUrlPath.LastIndexOf('/') ? bgUrlPath.Substring(bgDotIndex) : string.Empty;
        string fileName = imgNum + bgExtension;

        ImgDownloader.DownloadImg(wc, bgImgUri.AbsoluteUri, "images/bgcImg/", fileName);
    }
}

The complete code

using System;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;

namespace WebCrawlerDemo
{
    /// <summary>
    /// Console crawler: loads one static page and downloads the images referenced
    /// by its img tags and by inline background-image styles.
    /// </summary>
    class Program
    {
        // Captures the content of url(...) after background-image, allowing optional
        // whitespace and optional single or double quotes around the URL.
        private static readonly Regex BackgroundImageRegex = new Regex(
            @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")]+)['""]?\s*\)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Loads the page, downloads its images and waits for a key press before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string url = "https://www.bilibili.com";
            Uri pageUri = new Uri(url);
            HtmlWeb web = new HtmlWeb();
            HtmlDocument hd = web.Load(url);// download and parse the HTML page

            int imgNum = 0;// running image number, used as the file name

            // WebClient is IDisposable; release it once every download has finished.
            using (WebClient wc = new WebClient())
            {
                // Images referenced by img tags.
                // SelectNodes returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection imgNodes = hd.DocumentNode.SelectNodes("//img");
                if (imgNodes != null)
                {
                    foreach (HtmlNode node in imgNodes)
                    {
                        HtmlAttribute src = node.Attributes["src"];
                        Uri imgUri;
                        if (src != null && TryResolveImageUri(pageUri, src.Value, out imgUri))
                        {
                            imgNum++;
                            string fileName = imgNum + GetExtension(imgUri);
                            ImgDownloader.DownloadImg(wc, imgUri.AbsoluteUri, "images/", fileName);
                        }
                    }
                }

                // Background images declared in inline styles
                // (the XPath selects every node that has a style attribute).
                HtmlNodeCollection styledNodes = hd.DocumentNode.SelectNodes("//*[@style]");
                if (styledNodes != null)
                {
                    foreach (HtmlNode node in styledNodes)
                    {
                        Match match = BackgroundImageRegex.Match(node.Attributes["style"].Value);
                        Uri bgImgUri;
                        if (match.Success && TryResolveImageUri(pageUri, match.Groups["url"].Value, out bgImgUri))
                        {
                            imgNum++;
                            string fileName = imgNum + GetExtension(bgImgUri);
                            ImgDownloader.DownloadImg(wc, bgImgUri.AbsoluteUri, "images/bgcImg/", fileName);
                        }
                    }
                }
            }

            Console.WriteLine("----------END----------");
            Console.ReadKey();
        }

        /// <summary>
        /// Resolves a raw URL taken from the page against the page address and accepts
        /// it only when the result is an http or https address.
        /// </summary>
        /// <param name="pageUri">Address of the page the URL was found on.</param>
        /// <param name="rawUrl">Absolute, protocol-relative or relative URL; may be null or blank.</param>
        /// <param name="imageUri">The resolved absolute address, or null when the method returns false.</param>
        /// <returns>True when <paramref name="rawUrl"/> resolves to an http/https address.</returns>
        private static bool TryResolveImageUri(Uri pageUri, string rawUrl, out Uri imageUri)
        {
            imageUri = null;
            if (string.IsNullOrWhiteSpace(rawUrl))
            {
                return false;
            }

            Uri resolved;
            if (!Uri.TryCreate(pageUri, rawUrl.Trim(), out resolved))
            {
                return false;
            }

            // data:, javascript: and similar references cannot be downloaded by DownloadImg.
            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
            {
                return false;
            }

            imageUri = resolved;
            return true;
        }

        /// <summary>
        /// Returns the extension (including the dot) of the last path segment of
        /// <paramref name="imageUri"/>, or an empty string when that segment has none.
        /// The query string and the host name are never considered.
        /// </summary>
        private static string GetExtension(Uri imageUri)
        {
            string path = imageUri.AbsolutePath;
            int dotIndex = path.LastIndexOf('.');
            return dotIndex > path.LastIndexOf('/') ? path.Substring(dotIndex) : string.Empty;
        }
    }
    /// <summary>
    /// Downloads images to the local disk.
    /// </summary>
    public class ImgDownloader
    {
        /// <summary>
        /// Downloads one image and stores it as <paramref name="fileName"/> inside
        /// <paramref name="folderPath"/>, creating the folder when it does not exist yet.
        /// A failed download is reported on the console instead of being thrown, so one
        /// broken image does not abort the caller's loop.
        /// </summary>
        /// <param name="webClient">Client used to perform the download.</param>
        /// <param name="url">
        /// Image URL. When it does not start with "http://" or "https://" it is completed
        /// by prefixing "https:" (this turns a protocol-relative "//host/a.png" into a full URL).
        /// </param>
        /// <param name="folderPath">
        /// Target folder, with or without a trailing separator. When null or empty the
        /// file is written relative to the current directory.
        /// </param>
        /// <param name="fileName">Name of the file to create.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="webClient"/>, <paramref name="url"/> or <paramref name="fileName"/> is null.
        /// </exception>
        public static void DownloadImg(WebClient webClient, string url, string folderPath, string fileName)
        {
            if (webClient == null)
            {
                throw new ArgumentNullException("webClient");
            }
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }
            if (fileName == null)
            {
                throw new ArgumentNullException("fileName");
            }

            string targetPath = fileName;
            if (!string.IsNullOrEmpty(folderPath))
            {
                // CreateDirectory does nothing when the folder already exists,
                // so a separate Exists check is unnecessary.
                Directory.CreateDirectory(folderPath);
                // Path.Combine adds the separator only when folderPath lacks one.
                targetPath = Path.Combine(folderPath, fileName);
            }

            // Complete URLs that come without a scheme. The scheme is checked at the START
            // of the URL: a URL that merely contains "http:" somewhere (for example inside
            // its query string) still needs the prefix.
            url = url.Trim();
            bool hasHttpScheme =
                url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasHttpScheme)
            {
                url = "https:" + url;
            }

            try
            {
                webClient.DownloadFile(url, targetPath);
                Console.WriteLine(fileName + "下载成功");
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: report the failure together with the URL
                // that caused it and let the caller continue with the next image.
                Console.Write(ex.Message);
                Console.WriteLine(url);
            }
        }
    }
}

Reference article

Guess you like

Origin www.cnblogs.com/xueyubao/p/11462169.html
Recommended