网页爬虫抓取URL简单实现

关键字：网页爬虫抓取URL简单实现

//开始......

package com.ogilvy.sayes.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;

/**
 * Minimal web crawler: downloads pages, prints the absolute {@code http}/{@code https}
 * links found in their {@code href=} attributes, and follows those links down to a
 * bounded depth.
 *
 * @author long.tang
 */
public class SearchCrawler {

    /** Literal that introduces a link target in the markup. */
    private static final String HREF_MARKER = "href=";

    /** Charset used to decode every downloaded page. */
    private static final String PAGE_CHARSET = "GBK";

    /** Value returned by {@link #myGetHttpFile2(String)} when a page cannot be read. */
    private static final String NO_CONTENT = "nothing";

    /**
     * Downloads the resource at {@code url} and decodes it as GBK text.
     *
     * <p>This method never throws: any failure (malformed URL, I/O error, unsupported
     * charset) is reported on the console and the sentinel string {@code "nothing"} is
     * returned instead, which {@link #doit(String, int)} then treats as a page without
     * links.
     *
     * @param url absolute URL of the resource to download
     * @return the decoded body, or {@code "nothing"} if it could not be read
     */
    public String myGetHttpFile2(String url) {
        String returnStr = null;
        // try-with-resources closes the stream even when a read fails part-way.
        try (InputStream in = new URL(url).openStream()) {
            ByteArrayOutputStream raw = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = in.read(chunk)) != -1) {
                raw.write(chunk, 0, count);
            }
            // Decode the collected bytes once; multi-byte GBK sequences stay intact.
            returnStr = raw.toString(PAGE_CHARSET);
        } catch (IOException | RuntimeException e) {
            // Best effort by design: one unreachable page must not abort the crawl.
            System.out.println("error>>>>");
            e.printStackTrace();
        }
        return returnStr != null ? returnStr : NO_CONTENT;
    }

    /**
     * Prints every crawlable link found in {@code content} and recursively crawls the
     * pages those links point to.
     *
     * <p>{@code depth} is decremented on entry and nothing happens unless the result is
     * at least 1. Consequently {@code depth == 2} only prints the links of
     * {@code content}, {@code depth == 3} additionally downloads each link and prints
     * the links found there, and so on. Each link is printed before its page is
     * crawled.
     *
     * @param content markup to scan; {@code null} is treated as a page without links
     * @param depth   number of levels to process, counted as described above
     * @throws Exception declared for compatibility with existing callers
     */
    public void doit(String content, int depth) throws Exception {
        depth--;
        if (depth < 1 || content == null) {
            return;
        }

        // The recursive call returns immediately when its own decremented depth drops
        // below 1, so downloading a page for that call would be wasted work.
        boolean followLinks = depth > 1;
        // Links already crawled from this page; a repeated link is printed again but
        // downloaded only once.
        Set<String> crawled = new HashSet<String>();

        int from = 0;
        int at;
        while ((at = content.indexOf(HREF_MARKER, from)) > -1) {
            from = at + 1;

            String url = readHrefValue(content, at + HREF_MARKER.length());
            if (url == null || !isCrawlable(url)) {
                continue;
            }

            System.out.println(url);
            if (followLinks && crawled.add(url)) {
                doit(myGetHttpFile2(url), depth);
            }
        }
    }

    /**
     * Extracts the attribute value that starts at {@code valueStart}.
     *
     * <p>Handles double-quoted, single-quoted and unquoted values. An unquoted value
     * ends at the first whitespace character, at {@code '>'}, or at the end of the
     * text.
     *
     * @return the trimmed value, or {@code null} when there is no value or its closing
     *         quote is missing
     */
    private static String readHrefValue(String content, int valueStart) {
        if (valueStart >= content.length()) {
            return null;
        }
        char first = content.charAt(valueStart);
        if (first == '"' || first == '\'') {
            int close = content.indexOf(first, valueStart + 1);
            return close < 0 ? null : content.substring(valueStart + 1, close).trim();
        }
        int stop = valueStart;
        while (stop < content.length()) {
            char c = content.charAt(stop);
            if (c == '>' || Character.isWhitespace(c)) {
                break;
            }
            stop++;
        }
        return content.substring(valueStart, stop).trim();
    }

    /**
     * Tells whether {@code url} is an absolute http(s) link that does not mention one
     * of the skipped file types ({@code .css}, {@code .ico}, {@code .exe}).
     */
    private static boolean isCrawlable(String url) {
        return url.startsWith("http")
                && !url.contains(".css")
                && !url.contains(".ico")
                && !url.contains(".exe");
    }

    /**
     * Crawls {@code http://www.2345.com/} with a depth of 3 and prints the links found.
     *
     * @param arg command-line arguments (ignored)
     */
    public static void main(String[] arg) {
        SearchCrawler search = new SearchCrawler();
        try {
            search.doit(search.myGetHttpFile2("http://www.2345.com/"), 3);
        } catch (Exception e) {
            // Program boundary: report the failure and exit normally.
            e.printStackTrace();
        }
    }

}

//结束.....

网页爬虫抓取URL简单实现

猜你喜欢