在程序中获取百度的搜索结果


using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Net;
using System.IO;
namespace baiduRobot
{
    /// <summary>
    /// Plain data holder for one search result extracted from the result page.
    /// Every field stays null until something is assigned to it.
    /// </summary>
    struct BaiduEntry
    {
        /// <summary>Title text of the result.</summary>
        public string title;

        /// <summary>Short summary shown for the result.</summary>
        public string brief;

        /// <summary>Target URL of the result link.</summary>
        public string link;
    }
    class Program
    {
        /// <summary>
        /// Downloads the Baidu result page for <paramref name="keyword"/> and returns it as text.
        /// </summary>
        /// <param name="keyword">Search keyword typed by the user.</param>
        /// <returns>
        /// The page decoded with code page 936. If the request or the download fails,
        /// a message is printed and whatever was read so far (possibly an empty string)
        /// is returned.
        /// </returns>
        static string GetHtml(string keyword)
        {
            const string url = @"http://www.baidu.com/";

            // Original author's note: Baidu takes the query string in code page 936,
            // while Google accepts both UTF-8 and code-page encoded queries.
            Encoding gbk = Encoding.GetEncoding(936);
            string encodedKeyword = HttpUtility.UrlEncode(keyword, gbk);

            // Fix: the encoded keyword was computed but the raw keyword was sent.
            string query = "s?wd=" + encodedKeyword;

            StringBuilder sb = new StringBuilder();
            try
            {
                Console.WriteLine("正在读取网页{0}的内容……", url + query);

                // Fix: the request is now made inside the try block so that a failed
                // connection is reported instead of escaping as an unhandled exception.
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url + query);
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                using (Stream stream = response.GetResponseStream())
                // Fix: a StreamReader keeps decoder state between reads, so a two-byte
                // character split across two network reads is no longer corrupted.
                // BOM detection is disabled to keep decoding pinned to code page 936.
                using (StreamReader reader = new StreamReader(stream, gbk, false))
                {
                    char[] buf = new char[8192];
                    int count;
                    while ((count = reader.Read(buf, 0, buf.Length)) > 0)
                    {
                        sb.Append(buf, 0, count);
                    }
                }
            }
            catch (WebException)
            {
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }
            catch (IOException)
            {
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }
            return sb.ToString();
        }
        /// <summary>
        /// Writes every entry to the console, numbered from 1, printing only the
        /// fields that are not null and a separator line after each entry.
        /// </summary>
        /// <param name="entries">Parsed search results to print.</param>
        static void PrintResult(List<BaiduEntry> entries)
        {
            int ordinal = 0;
            foreach (BaiduEntry item in entries)
            {
                ordinal++;
                Console.WriteLine("找到了百度的第{0}条搜索结果:", ordinal);

                if (item.link != null)
                {
                    Console.WriteLine("找到了一条链接:");
                    Console.WriteLine(item.link);
                }

                if (item.title != null)
                {
                    Console.WriteLine("标题为:");
                    Console.WriteLine(item.title);
                }

                if (item.brief != null)
                {
                    Console.WriteLine("下面是摘要:");
                    Console.WriteLine(item.brief);
                }

                Cut();
            }
        }
        /// <summary>
        /// Manual smoke check: strips the tags from a fixed HTML sample and prints the result.
        /// </summary>
        static void simpleOutput()
        {
            string stripped = RemoveSomeTags("<table><tr><td><font>test</font><a>hello</a><br></td></tr></table>");
            Console.WriteLine(stripped);
        }
        /// <summary>
        /// Deletes every literal occurrence of the listed void tags (currently only
        /// "&lt;br&gt;", matched case-sensitively) from <paramref name="html"/>.
        /// </summary>
        /// <param name="html">HTML fragment to clean.</param>
        /// <returns>The fragment with those tags removed.</returns>
        static string RemoveVoidTag(string html)
        {
            string[] voidTags = { "<br>" };
            string cleaned = html;
            for (int i = 0; i < voidTags.Length; i++)
            {
                cleaned = cleaned.Replace(voidTags[i], "");
            }
            return cleaned;
        }
        /// <summary>
        /// Strips the opening and closing a, em, b and font tags from
        /// <paramref name="html"/> while keeping the text between them.
        /// </summary>
        /// <param name="html">HTML fragment to clean; must not be null.</param>
        /// <returns>The fragment without those tags.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
        static string ReleaseXmlTags(string html)
        {
            // Fix: the old patterns "<a.*?>" and "<font.*?>" had no boundary after the
            // tag name, so "<abbr>" or "<address>" lost their opening tag while the
            // closing tag stayed behind. The name must now be followed directly by '>'
            // or by whitespace plus attributes. "[^>]*" also covers attributes that wrap
            // onto another line, and matching is case-insensitive like the other
            // tag regexes in this file.
            const string inlineTagPattern = "</?(?:a|em|b|font)(?:\\s[^>]*)?>";
            return Regex.Replace(html, inlineTagPattern, "", RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Cleans an HTML fragment in two steps: first <see cref="RemoveVoidTag"/>,
        /// then <see cref="ReleaseXmlTags"/> on its result.
        /// </summary>
        /// <param name="html">HTML fragment to clean.</param>
        /// <returns>The cleaned fragment.</returns>
        static string RemoveSomeTags(string html)
        {
            return ReleaseXmlTags(RemoveVoidTag(html));
        }
        /// <summary>
        /// Prints a line of tildes to the console as a visual separator.
        /// </summary>
        static void Cut()
        {
            const string separator = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";
            Console.WriteLine(separator);
        }
        /// <summary>
        /// Convenience overload: parses and prints the results with the HTML tags
        /// stripped from each brief.
        /// </summary>
        /// <param name="input">HTML fragment containing the result tables.</param>
        static void MainProc(string input)
        {
            const bool keepTagsInBrief = false;
            MainProc(input, keepTagsInBrief);
        }
        /// <summary>
        /// Extracts the result tables from <paramref name="input"/>, turns each table
        /// cell into a <see cref="BaiduEntry"/> (link, title, brief) and prints the
        /// collected entries with <see cref="PrintResult"/>.
        /// </summary>
        /// <param name="input">HTML fragment expected to contain the result tables; must not be null.</param>
        /// <param name="tagsForBrief">
        /// When true the brief keeps its inline HTML tags; when false they are removed
        /// with <see cref="RemoveSomeTags"/>.
        /// </param>
        static void MainProc(string input, bool tagsForBrief)
        {
            // Fix: the pattern used to be "<table*</table>", where '*' quantifies the
            // preceding 'e'. It could only match text such as "<tabl</table>" and never
            // a real table element, so no entry was ever produced. Match lazily from
            // "<table" to the nearest "</table>", across line breaks.
            Regex tableRegex = new Regex("<table\\b[\\s\\S]*?</table>", RegexOptions.IgnoreCase);
            // Group 1 is the inner HTML of a td cell ('.' does not cross line breaks).
            Regex cellRegex = new Regex("<td.*?>(.*)</td>", RegexOptions.IgnoreCase);
            // Group 1 is the href value of the first anchor in the cell.
            Regex linkRegex = new Regex("<a.*?href=\"(.*?)\".*?>", RegexOptions.IgnoreCase);
            // Spans from the first "<font" to the last "</font>" on the line. The original
            // author notes that the cell nests font tags two levels deep.
            Regex fontBlockRegex = new Regex("<font.*</font>");
            // Group 1 is the content of one font element (lazy, up to the first "</font>").
            Regex fontContentRegex = new Regex("<font.*?>(.*?)</font>");

            // Each regex is built once and has its own match variable. The old code reused
            // a single Regex/Match pair for every purpose and rebuilt the regexes per cell.
            List<BaiduEntry> entries = new List<BaiduEntry>();
            for (Match table = tableRegex.Match(input); table.Success; table = table.NextMatch())
            {
                for (Match cell = cellRegex.Match(table.Value); cell.Success; cell = cell.NextMatch())
                {
                    // Inner HTML of the td cell; this is where the result is expected.
                    string html = cell.Groups[1].Value;
                    BaiduEntry baidu = new BaiduEntry();

                    Match linkMatch = linkRegex.Match(html);
                    if (linkMatch.Success)
                    {
                        baidu.link = linkMatch.Groups[1].Value;
                    }

                    // Narrow the cell down to its font block; empty when there is none.
                    html = fontBlockRegex.Match(html).Value;

                    Match contentMatch = fontContentRegex.Match(html);
                    if (contentMatch.Success)
                    {
                        // First font element: the title.
                        baidu.title = RemoveSomeTags(contentMatch.Groups[1].Value);

                        // Second font element, if any: the brief.
                        contentMatch = contentMatch.NextMatch();
                        if (contentMatch.Success)
                        {
                            string brief = contentMatch.Groups[1].Value;
                            // Drop anything from a nested "<font" onwards.
                            int splitIndex = brief.IndexOf("<font", StringComparison.Ordinal);
                            if (splitIndex > -1)
                            {
                                brief = brief.Substring(0, splitIndex);
                            }
                            // Strip the HTML tags unless the caller asked to keep them.
                            if (!tagsForBrief)
                            {
                                brief = RemoveSomeTags(brief);
                            }
                            baidu.brief = brief;
                        }
                    }
                    else if (html == "")
                    {
                        // The cell has no font block at all: skip it without adding an entry.
                        continue;
                    }
                    else
                    {
                        // A font block exists but could not be parsed: report it, then
                        // still record the entry (it may carry a link), as before.
                        Console.WriteLine("怪了,这里没有找到任何结果。");
                        Console.WriteLine("如果百度已经更改了页面的结构那么程序需要重新设计。");
                        Console.WriteLine("Mark:");
                        Console.WriteLine(html);
                        Cut();
                        Cut();
                        Cut();
                    }

                    entries.Add(baidu);
                }
            }

            PrintResult(entries);
        }
        /// <summary>
        /// Entry point: asks for a keyword, downloads the result page, cuts out the
        /// region from the first table carrying class="result" to the last closing
        /// table tag, prints the parsed results and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main(string[] args)
        {
            Console.WriteLine("请输入一个关键字。");
            string keyword = Console.ReadLine();

            Console.WriteLine("正在从百度上获取结果,请稍等……");
            string page = GetHtml(keyword);

            // Greedy on purpose: spans from the first result table to the last "</table>".
            Match resultArea = Regex.Match(page, "<table.*class=\"result\"[\\s\\S]*</table>", RegexOptions.IgnoreCase);
            MainProc(resultArea.Value);

            Console.ReadKey(true);
        }
    }
}


猜你喜欢

转载自blog.csdn.net/seamonkey/article/details/6957080
今日推荐