ASP.NET crawler using HtmlAgilityPack

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using HtmlAgilityPack;//引用爬虫DLL
using System.Text;
using DotNet;
using System.Net;
using System.IO;

/// <summary>
/// Web Forms page that scrapes link lists from two external sites with
/// HtmlAgilityPack and echoes what it finds into the HTTP response.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>
    /// Runs the knowledge-base crawler each time the page is loaded.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        kwldg_Reptile();
    }

    /// <summary>
    /// Crawler for the "hot products" list on the home-furnishing home page.
    /// For every matching anchor it writes the title, link and image address
    /// to the response and downloads the image into ~/upload/link/.
    /// </summary>
    /// <remarks>
    /// Network and file errors raised by <c>HtmlWeb.Load</c> or
    /// <c>WebClient.DownloadFile</c> are not caught here and propagate to the caller.
    /// </remarks>
    private void TencentHome_HotProdcut_Reptile()
    {
        // Address to crawl.
        string url = "http://hm.jia360.com/";

        // Fetch and parse the page.
        HtmlWeb web = new HtmlWeb();
        HtmlDocument htmldoc = web.Load(url);

        // All <a> nodes directly under an <li> inside the tab box.
        HtmlNodeCollection aCollection = htmldoc.DocumentNode.SelectNodes("//*[starts-with(@class,'tab_box ')]//li/a");
        if (aCollection == null)
        {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            return;
        }

        // Base address used to resolve relative image paths.
        Uri pageUri = new Uri(url);

        // Make sure the target folder exists before the first download.
        string saveDir = Server.MapPath("~/upload/link/");
        Directory.CreateDirectory(saveDir);

        // One client for the whole run, disposed when the loop finishes.
        using (WebClient wc = new WebClient())
        {
            foreach (HtmlNode item in aCollection)
            {
                // InnerText still carries HTML entities; decode them so the
                // encoding done on output does not double-escape.
                string title = HtmlEntity.DeEntitize(item.InnerText);

                // A missing attribute becomes an empty string instead of a crash.
                string href = item.GetAttributeValue("href", string.Empty);

                // First child <img> that actually has a src attribute, if any.
                HtmlNode img = item.SelectSingleNode("./img[@src]");
                string imgpath = img == null ? string.Empty : img.GetAttributeValue("src", string.Empty);

                // Download only when the address resolves to an http(s) URL
                // and yields a usable file name.
                Uri imgUri;
                if (imgpath.Length > 0
                    && Uri.TryCreate(pageUri, imgpath, out imgUri)
                    && (imgUri.Scheme == Uri.UriSchemeHttp || imgUri.Scheme == Uri.UriSchemeHttps))
                {
                    // LocalPath excludes the query string, so "?v=1" never
                    // ends up in the file name.
                    string fileName = Path.GetFileName(imgUri.LocalPath);
                    if (!string.IsNullOrEmpty(fileName))
                    {
                        wc.DownloadFile(imgUri, Path.Combine(saveDir, fileName));
                    }
                }

                // Output.
                WriteEncodedLine(title);
                WriteEncodedLine(href);
                WriteEncodedLine(imgpath);
            }
        }
    }

    /// <summary>
    /// Crawler for the studio's knowledge-base article list. Writes the title
    /// and link of every matching anchor to the response.
    /// </summary>
    /// <remarks>
    /// Network errors raised by <c>HtmlWeb.Load</c> are not caught here.
    /// </remarks>
    private void kwldg_Reptile()
    {
        // Address to crawl.
        string url = "http://98keji.com/article/article_list.aspx?pn=1";

        // Fetch and parse the page.
        HtmlWeb web = new HtmlWeb();
        HtmlDocument htmldoc = web.Load(url);

        // All <a> nodes directly under an <li> inside the article list.
        HtmlNodeCollection aCollection = htmldoc.DocumentNode.SelectNodes("//*[starts-with(@class,'article_list ')]//li/a");
        if (aCollection == null)
        {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            return;
        }

        foreach (HtmlNode item in aCollection)
        {
            // Decode entities first; WriteEncodedLine re-encodes for output.
            string title = HtmlEntity.DeEntitize(item.InnerText);

            // A missing attribute becomes an empty string instead of a crash.
            string href = item.GetAttributeValue("href", string.Empty);

            // Output.
            WriteEncodedLine(title);
            WriteEncodedLine(href);
        }
    }

    /// <summary>
    /// Writes one line of scraped text to the response followed by a line break.
    /// The text is HTML-encoded because it originates from a remote site and
    /// must not be interpreted as markup by the browser.
    /// </summary>
    /// <param name="text">Raw text to display.</param>
    private void WriteEncodedLine(string text)
    {
        Response.Write(HttpUtility.HtmlEncode(text) + "<br/>");
    }
}

Guess you like

Origin blog.csdn.net/qq_33285360/article/details/109219395