单线程简单爬虫

要求:

1.给定了一个网页网址(URL),这个就是我们爬虫项目的入口网页,从哪开始爬

http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml

2.把当天的新闻内容全部爬取保存到本地文件中

3.方便以后我们可以迅速查找(在本地文件中)某个新闻,供我们做分析使用

这里为了简化,我们的要求就是找到对应的新闻内容打印输出到控制台System.out.println()

4.爬取的数据按天为单位划分目录,一天生成一个文件夹,文件夹下有2个文件,

一个是数据文件(存储爬取的所有新闻),一个是索引文件(存储某个新闻对应的位置,方便我们查找)。

爬取前先分析网页结构,找到自己需要内容的部分,编写正则表达式

一、建立一个maven项目

二、抓取网页内容并存储到对应的数据文件和索引文件中

1、建立所需工具包

/**
 * Utility for quietly closing resources (streams, readers, files, ...).
 *
 * @author Administrator
 */
public class CloseUtil {
    /**
     * Closes the given resource if it is non-null. Any exception thrown
     * by {@code close()} is only printed, never propagated.
     *
     * @param obj the resource to close; may be {@code null}
     */
    public static void close(AutoCloseable obj) {
        if (obj == null) {
            return; // nothing to do
        }
        try {
            obj.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
/**
 * 爬取网页内容
 * @author Administrator
 *
 */
public class WebUtil {
    /**
     * 将爬取的内容以字符形式返回
     * @param urlStr
     * @param encoding
     * @return
     */
    public static String urlGetString(String urlStr, String encoding) {
        StringBuffer sb = new StringBuffer();
        URL url = null;
        URLConnection conn = null;
        BufferedReader br = null;
        try {
            url = new URL(urlStr);
            conn = url.openConnection();
            br = new BufferedReader(new InputStreamReader(
                    conn.getInputStream(), encoding));
            String line = null;
            while ((line = br.readLine()) != null) {
                sb.append(line).append(System.lineSeparator());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(br);

        }

        return sb.toString();
    }
    /**
     * 将爬取内容以字节数组形式返回
     * 以便查取对应新闻内容的长度
     * @param urlStr
     * @return
     */
    public static byte[] urlGetByteArray(String urlStr) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        BufferedInputStream bis = null;
        byte[] byteArray = new byte[0];
        try {
            URL url = new URL(urlStr);
            URLConnection conn = url.openConnection();
            bis = new BufferedInputStream(conn.getInputStream());
            int b = -1;
            while ((b = bis.read()) != -1) {
                baos.write(b);
            }
            byteArray = baos.toByteArray();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(bis);
            CloseUtil.close(baos);

        }
        return byteArray;
    }

}
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Regular-expression helpers used to extract the wanted pieces
 * from crawled page content.
 *
 * @author Administrator
 */
public class RegexUtil {
    /**
     * Concatenates every substring of {@code input} that matches
     * {@code regex}.
     *
     * @param input text to search
     * @param regex pattern to look for
     * @return all matched substrings joined together; "" when none match
     */
    public static String match(String input, String regex) {
        // StringBuilder: single-threaded use, StringBuffer's locking is wasted
        StringBuilder sb = new StringBuilder();
        Matcher m = Pattern.compile(regex).matcher(input);
        while (m.find()) {
            sb.append(m.group());
        }
        return sb.toString();
    }

    /**
     * Returns capture group {@code grpNum} of the <em>last</em> occurrence
     * of {@code regex} in {@code input} (the loop deliberately overwrites
     * earlier matches, preserving the original behavior), or "" when there
     * is no match.
     *
     * @param input  text to search
     * @param regex  pattern containing at least {@code grpNum} groups
     * @param grpNum 1-based capture group index
     * @return the group text of the last match, or "" if none
     */
    public static String match(String input, String regex, int grpNum) {
        String result = "";
        Matcher m = Pattern.compile(regex).matcher(input);
        while (m.find()) {
            result = m.group(grpNum);
        }
        return result;
    }

    /**
     * Collects every substring of {@code input} that matches {@code regex}.
     *
     * @param input text to search
     * @param regex pattern to look for
     * @return matches in order of appearance; empty list when none
     */
    public static List<String> matchList(String input, String regex) {
        List<String> list = new ArrayList<String>();
        Matcher m = Pattern.compile(regex).matcher(input);
        while (m.find()) {
            list.add(m.group());
        }
        return list;
    }

}
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
/**
 * 将爬取的内容输出到硬盘
 * @author Administrator
 *
 */
public class IOUtil {
    public static void writeDataFile(String dataFile, byte[] ba){
        OutputStream os = null;
        try{
            os = new FileOutputStream(dataFile, true);
            os.write(ba);
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            CloseUtil.close(os);
        }
        
    }
    public static void writeIndexFile(String indexFile,String str){
        PrintWriter pw = null;
        try{
            pw = new PrintWriter(new FileOutputStream(indexFile, true));
            pw.println(str);
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            CloseUtil.close(pw);
        }
        
    }

}

2、准备就绪开始爬取

import java.io.File;
import java.util.List;
import cn.dd.util.IOUtil;
import cn.dd.util.RegexUtil;
import cn.dd.util.WebUtil;

/**
 * Single-threaded crawler entry point: fetches the Sina news list page,
 * extracts each article link, and stores every article in a per-day data
 * file plus an index file.
 */
public class Spider {
    public static void main(String[] args) {
        crawler();
    }

    /** Fetches the list page and processes each news item found on it. */
    public static void crawler() {
        String listUrl = "http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml";
        // The list page is served in GB2312.
        String pageText = WebUtil.urlGetString(listUrl, "gb2312");
        // Narrow down to the <ul> that holds the headline list.
        String ulHtml = RegexUtil.match(pageText, "<ul class=\"list_009\">[\\s\\S]*?</ul>");
        // One <li> per news item.
        List<String> items = RegexUtil.matchList(ulHtml, "<li>[\\s\\S]*?</li>");
        String itemRegex = "<li><a href=\"([\\S]*?)\" target=\"_blank\">([\\S\\s]*?)</a><span>\\(([\\S]*?) [\\S]*?\\)</span></li>";
        for (String item : items) {
            String articleUrl = RegexUtil.match(item, itemRegex, 1);
            String title = RegexUtil.match(item, itemRegex, 2);
            String day = RegexUtil.match(item, itemRegex, 3);
            detailProcessor(articleUrl, title, day);
        }
    }

    /**
     * Downloads one article and appends its bytes to the day's data file,
     * recording title / byte offset / length / URL in the index file.
     */
    public static void detailProcessor(String liUrlStr, String liTitle,
            String liDate) {
        byte[] body = WebUtil.urlGetByteArray(liUrlStr); // raw article bytes
        String dayDir = "F:" + File.separator + "something"
                + File.separator + liDate + File.separator;
        File dirObj = new File(dayDir);
        if (!dirObj.exists()) {
            dirObj.mkdirs();
        }
        String dataPath = dayDir + "spider_data.dat";
        String indexPath = dayDir + "spider_index.dat";
        // The article starts at the current end of the data file.
        long offset = new File(dataPath).length();
        char sep = '\u0001'; // field separator within an index line
        String indexLine = liTitle + sep + offset + sep + body.length
                + sep + liUrlStr;
        IOUtil.writeDataFile(dataPath, body);
        IOUtil.writeIndexFile(indexPath, indexLine);
    }
}

爬取内容基本完成,运行可生成对应要求的数据文件和索引文件

三、客户端的建立

1、编写所需工具包

import java.io.RandomAccessFile;

public class IndexUtil {
    public static String index(String pos, String size, String dataFile) {

        String encoding = "utf-8";
        String str = "";
        RandomAccessFile raf = null;
        try {
            raf = new RandomAccessFile(dataFile, "r");
            Long po = Long.valueOf(pos);
            raf.seek(po);
            Integer si = Integer.valueOf(size);
            byte[] b = new byte[si];
            raf.read(b);
            str = new String(b, encoding);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(raf);
        }
        return str;
    }

    public static String index(String pos, String size, String dataFile,String encoding) {
        String str = "";
        RandomAccessFile raf = null;
        try {
            raf = new RandomAccessFile(dataFile, "r");
            Long po = Long.valueOf(pos);
            raf.seek(po);
            Integer si = Integer.valueOf(size);
            byte[] b = new byte[si];
            raf.read(b);
            str = new String(b, encoding);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(raf);
        }
        return str;
    }
}

2、编写客户端

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import cn.dd.util.CloseUtil;
import cn.dd.util.IndexUtil;

/**
 * Lookup client: given a news URL, scans the day's index file for the
 * matching entry and prints the stored article from the data file.
 */
public class SpiderIndex {
    public static void main(String[] args) {
        input("http://news.sina.com.cn/c/nd/2018-08-23/doc-ihicsiav8438010.shtml");
    }

    /**
     * Searches the index file for the entry whose URL equals {@code str}
     * and, when found, prints the corresponding article text.
     */
    public static void input(String str) {
        String indexFile = "F:" + File.separator + "/something/08月23日/spider_index.dat";
        String dataFile = "F:" + File.separator + "/something/08月23日/spider_data.dat";
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(indexFile), "utf-8"));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                // Index line layout: title \u0001 offset \u0001 length \u0001 url
                String[] fields = line.split("\u0001");
                if (!str.equals(fields[3])) {
                    continue; // not the article we are looking for
                }
                System.out.println(IndexUtil.index(fields[1], fields[2], dataFile));
                break;
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(reader);
        }
    }
}

猜你喜欢

转载自www.cnblogs.com/ddaifenxiang/p/9610816.html