手写网络爬虫获取网页源代码

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/sinat_27933301/article/details/81274891
import java.io.*;
import java.net.*;

public class BigData {

    /** Size, in chars, of the buffer used when copying the response body. */
    private static final int COPY_BUFFER_CHARS = 8192;

    /**
     * Fetches the resource behind a URL and returns its body as text.
     *
     * <p>The body is copied verbatim, so line terminators are preserved exactly as the
     * server sent them. Failures are reported on the console rather than thrown.
     *
     * @param url      the address to open; any protocol supported by {@link URL} works
     * @param encoding name of the charset used to decode the response bytes
     * @return the decoded body; an empty string when the URL is malformed or the connection
     *         cannot be opened, or the portion read so far when an I/O error interrupts the copy
     */
    public static String getHtmlResource(String url, String encoding){
        StringBuilder sb = new StringBuilder();
        try {
            URL urlObj = new URL(url);
            URLConnection uc = urlObj.openConnection();
            // The raw stream is declared as its own resource so it is still closed when the
            // InputStreamReader constructor rejects the charset name.
            try (InputStream in = uc.getInputStream();
                 Reader reader = new InputStreamReader(in, encoding)) {
                // Copy by chars instead of by lines: readLine() strips line terminators,
                // which would glue the lines of the page together.
                char[] buffer = new char[COPY_BUFFER_CHARS];
                int count;
                while ((count = reader.read(buffer)) != -1) {
                    sb.append(buffer, 0, count);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
            // A malformed URL is an input problem, not a connectivity problem.
            System.out.println("URL格式不正确!");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("网络连接异常!");
        }
        return sb.toString();
    }

    /**
     * Writes text to a file as UTF-8, replacing any existing content.
     *
     * <p>Failures are reported on the console rather than thrown.
     *
     * @param content  the text to write
     * @param filePath path of the file to create or overwrite
     */
    public static void writeFile(String content, String filePath){
        try (PrintWriter pw = new PrintWriter(new File(filePath), "UTF-8")) {
            pw.print(content);
            // PrintWriter never throws on write errors; checkError() flushes and reports
            // whether any write failed, so a failed write is not silent.
            if (pw.checkError()) {
                System.out.println("文件写入失败!");
            }
        } catch (UnsupportedEncodingException | FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads a page and saves it to disk.
     *
     * @param args optional: {@code args[0]} is the URL to fetch and {@code args[1]} the output
     *             file; when omitted, the original hard-coded values are used
     */
    public static void main(String[] args) {
        String url = args.length > 0 ? args[0] : "https://www.taobao.com";
        String filePath = args.length > 1 ? args[1] : "D:\\bigdata.txt";
        String content = getHtmlResource(url, "UTF-8");
        writeFile(content, filePath);
    }
}

猜你喜欢

转载自blog.csdn.net/sinat_27933301/article/details/81274891
今日推荐