c++ 爬取小说内容(未完)

额,还在尝试,socket编程还没写,以及函数没封装,问题很多。。。

先写了个获取小说目录的程序,顺便学一下 regex,结果只学会了用 (.*) 和 regex_search。

/******************version 0.0.1***************/
 
 
// name: getChapterAddress.cpp
// using: get the address of chapter from catalogue of novel
// author:amos
// time:2018/5/4
#include <bits/stdc++.h>

using namespace std;

/**
 * Extracts chapter link targets from a saved catalogue page.
 *
 * Reads the catalogue HTML line by line. For every link that directly follows
 * a `class="novel_num"></div>` marker, the href value up to (but excluding)
 * its ".html" suffix is captured. Each distinct capture is written to the
 * output file, one per line, in order of first appearance.
 *
 * @return EXIT_FAILURE if either file cannot be opened, otherwise 0.
 */
int main()
{
    std::ifstream catalogue("/home/calco/桌面/novel.txt");
    std::ofstream urls("/home/calco/桌面/url.txt");
    if (!catalogue.is_open())
    {
        std::perror("File open is failed");
        return EXIT_FAILURE;
    }
    if (!urls.is_open())
    {
        std::perror("urltxt open is failed");
        return EXIT_FAILURE;
    }

    // Compiled once, outside the read loop. The capture is limited to the
    // href attribute value ([^"]+) so that several links on one line are
    // matched separately instead of being swallowed by one greedy match,
    // and the dot before "html" is escaped so it only matches a literal '.'.
    const std::regex chapterLink(
        "class=\"novel_num\"></div><a href=\"([^\"]+)\\.html");

    std::set<std::string> seen;
    std::string line;
    // std::getline on a std::string has no length limit; a fixed-size char
    // buffer would set failbit on the first over-long line and stop the loop.
    while (std::getline(catalogue, line))
    {
        const std::sregex_iterator noMoreMatches;
        for (std::sregex_iterator it(line.begin(), line.end(), chapterLink);
             it != noMoreMatches; ++it)
        {
            const std::string url = (*it)[1].str();
            // insert().second is true only when the URL was not seen before.
            if (seen.insert(url).second)
                urls << url << '\n';
        }
    }

    // Both streams are flushed and closed by their destructors.
    return 0;
}

大概就是这样的html


处理完之后就是



/*********************************************************************************************/

然后写了获取小说内容的程序,去除了 <br /> 这类标签,以及 &nbsp;

// name: getChapterContent.cpp
// using: get the content of novel
// author:amos
// time:2018/5/4

#include <bits/stdc++.h>

using namespace std;

/**
 * Replaces every "&nbsp;" HTML entity in the input with a single space.
 *
 * The previous implementation assigned the result of the comparison
 * `find(...) != npos` to the position variable (operator precedence), so it
 * always edited the start of the string instead of the match. It also
 * searched for "nbsp; " without the leading '&' and needed a follow-up pass
 * removing stray "sp;" fragments. Matching the whole entity at the position
 * actually returned by find() makes that clean-up pass unnecessary.
 *
 * @param A  Text that may contain "&nbsp;" entities.
 * @return   A copy of A with each "&nbsp;" replaced by ' '. All other text,
 *           including HTML tags, is left untouched.
 */
std::string HandleString1(const std::string A)
{
    const std::string entity = "&nbsp;";
    const std::string replacement = " ";

    std::string result = A;
    std::string::size_type pos = 0;
    // The extra parentheses around the assignment are essential: without
    // them `pos` would receive the boolean result of the comparison.
    while ((pos = result.find(entity, pos)) != std::string::npos)
    {
        result.replace(pos, entity.size(), replacement);
        // Resume after the inserted space so replaced text is not rescanned.
        pos += replacement.size();
    }
    return result;
}

string HandleString(string A)   //处理</br>...用了最笨的方法
{
    string test_str = HandleString1(A);
    int strleng = test_str.length();
    for(int i = strleng;i>=0;i--)
    {
        if(test_str[i] == '<')
        {
            test_str[i] = ' ';
            for(int j = i+1;j<(strleng-5);j++)
                test_str[j] = test_str[j+6];
            for(int j = strleng-5;j<(strleng);j++)
                test_str[j] = ' ';
        }
    }
    return test_str;
}

int main()
{
    ifstream in("/home/calco/桌面/chapter.txt");
    ofstream out("/home/calco/桌面/chapterContent.txt");
    if(!in.is_open())
    {
        perror("File chapter open is failed");
        EXIT_FAILURE;
    }
    if(!out.is_open())
    {
        perror("File content open is failed");
        EXIT_FAILURE;
    }

    char temp[500];
    int flag = 0;
    while(in.getline(temp,500))
    {
        string s = temp;
        std::smatch m;
        std::regex e ("novel_title\"\>(.*)\</h");
        while(std::regex_search(s,m,e))//获取小说的章节名
        {
            std::regex reg1("^(\\s)*");
            std::regex reg2("\\s*$");

            string test_str(m[1]);
            string t("");
            test_str = std::regex_replace(test_str,reg1,t); //去除左边的空格
            test_str = std::regex_replace(test_str,reg2,t); //去除右边空格
            out<<test_str<<endl;
            s = m.suffix().str();
        }
        if(flag)
        {
            std::regex e3 ("\</div");
            if(!std::regex_search(s,e3))
            {
                string s_Handled = HandleString(s);
                out<<s_Handled<<endl;
            }
            else
                break;
        }
        if(!flag){
        std::regex e2 ("novel_content\"\>");
        if(std::regex_search(s,e2))
        {flag = 1;}
        }
    }

    in.close();
    out.close();
    return 0;
}

这个没用regex_replace,用的find和replace,结果问题很大,以后再改吧

猜你喜欢

转载自blog.csdn.net/amous_x/article/details/80202295