topic 知乎

using HtmlAgilityPack;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Collections;
using System.IO;

namespace EasySpider
{
    /// <summary>
    /// Extracts a nested topic hierarchy from an HTML page (the container is the
    /// <c>div</c> with id <c>zh-topic-organize-page-children</c>) and appends one JSON
    /// array per fourth-level topic to an output file.
    /// </summary>
    public class ReadZhihu
    {
        /// <summary>
        /// Parses <paramref name="document"/>, finds every anchor named "topic" that sits
        /// four <c>ul/li</c> levels below the container, and writes the chain of topics
        /// leading to it (outermost first) as one JSON line.
        /// </summary>
        /// <param name="document">Raw HTML of the page to process.</param>
        public static void FormatDocument(string document)
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(document);

            var singleNode = htmlDocument.DocumentNode.SelectSingleNode(".//div[@id=\"zh-topic-organize-page-children\"]");
            if (singleNode == null)
            {
                // The page does not contain the expected container; nothing to extract.
                return;
            }

            // HtmlAgilityPack returns null (not an empty collection) when nothing matches.
            var level4 = singleNode.SelectNodes(".//ul/li/ul/li/ul/li/ul/li/a[@name=\"topic\"]");
            if (level4 == null)
            {
                return;
            }

            foreach (var item in level4)
            {
                // GetParentNode pushes the leaf first and its ancestors afterwards, so
                // popping the stack yields the chain from the outermost topic to the leaf.
                Stack s = new Stack();
                GetParentNode(item, ref s);

                JArray structure = new JArray();
                while (s.Count > 0)
                {
                    structure.Add(s.Pop());
                }

                WriteData(structure, @"D:\学科Struct.json");
            }
        }

        /// <summary>
        /// Pushes a topic object for <paramref name="node"/> onto <paramref name="s"/> and
        /// then repeats the same for the first anchor found under the node's
        /// great-grandparent, walking upwards until no further anchor is available.
        /// </summary>
        /// <param name="node">Anchor element to start from.</param>
        /// <param name="s">Stack that receives one <see cref="JObject"/> per visited anchor.</param>
        public static void GetParentNode(HtmlNode node, ref Stack s)
        {
            HtmlNode current = node;

            while (current != null)
            {
                string url = current.GetAttributeValue("href", string.Empty);
                string topic = current.InnerHtml;

                // An anchor without a link or without content ends the walk.
                if (string.IsNullOrEmpty(url) || string.IsNullOrEmpty(topic))
                {
                    return;
                }

                // Reaching the same topic that is already on top of the stack means the
                // lookup resolved to the anchor we just recorded; stop to avoid duplicates.
                if (s.Count > 0 && ((JObject)s.Peek())["topic"].ToString() == topic)
                {
                    return;
                }

                s.Push(GenereateObject(url, topic));

                // Climb three levels (a -> li -> ul -> enclosing element) and take the
                // first anchor below that element as the next topic in the chain.
                HtmlNode ancestor = current;
                for (int level = 0; level < 3 && ancestor != null; level++)
                {
                    ancestor = ancestor.ParentNode;
                }

                current = ancestor == null ? null : ancestor.SelectSingleNode(".//a[1]");
            }
        }

        /// <summary>
        /// Builds the JSON object that represents a single topic.
        /// </summary>
        /// <param name="url">Value stored under the "url" key.</param>
        /// <param name="topic">Value stored under the "topic" key.</param>
        /// <returns>A new object with the properties "topic" and "url", in that order.</returns>
        public static JObject GenereateObject(string url, string topic)
        {
            JObject obj = new JObject();
            obj.Add("topic", topic);
            obj.Add("url", url);
            return obj;
        }

        /// <summary>
        /// Serializes <paramref name="obj"/> to JSON and appends it as one line to
        /// <paramref name="fileName"/>.
        /// </summary>
        /// <param name="obj">Object to serialize.</param>
        /// <param name="fileName">Path of the output file; it is created when missing.</param>
        public static void WriteData(object obj, string fileName)
        {
            string json = JsonConvert.SerializeObject(obj);

            // StreamWriter in append mode creates the file when it does not exist yet.
            // A separate File.Create call must not be used here: it returns an open
            // FileStream, and leaving that undisposed keeps the file locked so that
            // opening the writer fails with an IOException.
            using (StreamWriter sw = new StreamWriter(fileName, true))
            {
                sw.WriteLine(json);
            }
        }
    }
}
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace FormatDocument
{
    /// <summary>
    /// Console tool that reads a file of JSON arrays (one per line) and copies the first
    /// four elements of every sufficiently long array to an output file, skipping rows
    /// that the output file already contains.
    /// </summary>
    class Program
    {
        // Number of rows appended by WriteData during this run. Kept here because the
        // counter parameter of WriteData is passed by value and therefore cannot advance
        // between calls.
        private static int insertedRows = 0;

        /// <summary>
        /// Reads the input file line by line and hands every line to
        /// <see cref="WriteData"/>.
        /// </summary>
        static void Main(string[] args)
        {
            string path = @"D:\学科Struct.json";
            int i = 0; int j = 0;

            // Dispose the reader so the input file handle is released deterministically.
            using (StreamReader sr = new StreamReader(path, Encoding.UTF8))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    Console.WriteLine("------------------readline: {0}------------------", ++i);
                    WriteData(line, @"D:\topic.json", j);
                }
            }
        }

        /// <summary>
        /// Parses <paramref name="row"/> as a JSON array and, when it has at least four
        /// elements, appends the first four as one JSON line to
        /// <paramref name="fileName"/> unless that text is already present in the file.
        /// </summary>
        /// <param name="row">One line of input holding a JSON array.</param>
        /// <param name="fileName">Path of the output file; it is created when missing.</param>
        /// <param name="j">Base offset added to the running insert count in the log output.</param>
        public static void WriteData(string row, string fileName, int j)
        {
            Console.WriteLine("-----------write data begin -----------");

            // A blank line is not valid JSON; skip it instead of letting the parser throw.
            if (string.IsNullOrWhiteSpace(row))
            {
                return;
            }

            JArray item = JArray.Parse(row);
            if (item.Count < 4)
            {
                return;
            }

            JArray outArray = new JArray();
            for (int i = 0; i < 4; i++)
            {
                outArray.Add(item[i]);
            }

            string json = JsonConvert.SerializeObject(outArray);

            // On the first run the output file does not exist yet; treat that as empty
            // content rather than failing with FileNotFoundException.
            string existing = File.Exists(fileName) ? File.ReadAllText(fileName) : string.Empty;
            if (existing.Contains(json))
            {
                return;
            }

            // File.AppendText creates the file when it is missing.
            using (StreamWriter sw = File.AppendText(fileName))
            {
                insertedRows++;
                Console.WriteLine("-----------insert {0} row -----------", j + insertedRows);
                sw.WriteLine(json);
            }
        }
    }
}

猜你喜欢

转载自www.cnblogs.com/skywss27/p/9991048.html