java实现自动下载免费小说

在百度上看到有人想下载一些小说,于是想自己写一个自动下载小说的小程序玩玩。

以下为自己玩玩的,不必当真。



package edu.uci.ics.crawler4j.test;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Downloads a serialized novel chapter by chapter and appends every chapter to one
 * local UTF-8 text file.
 *
 * <p>Starting from a fixed chapter URL, each page is fetched with Apache HttpClient,
 * the chapter title and body are cut out of the HTML with plain string markers, the
 * text is appended to the output file, and the link behind the page's "next chapter"
 * anchor becomes the URL of the following request.
 *
 * <p>The string markers used for parsing are specific to the target site's page
 * layout; pages that do not match are reported as errors and nothing is written.
 *
 * @author andy.wang
 */
public class Plzl {

	public static final int READ_SIZE = 8 * 1024;

	/** Upper bound on the number of chapter requests issued by {@link #main}. */
	private static final int MAX_CHAPTERS = 151;

	/** Pause between two chapter requests, in milliseconds. */
	private static final long REQUEST_INTERVAL_MILLIS = 2000L;

	/** Width of the fixed-size tail slice used by the legacy next-URL extraction. */
	private static final int LEGACY_TAIL_LENGTH = 31;

	/** Matches a whole script element, including its content. */
	private static final Pattern SCRIPT_ELEMENT =
			Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

	/** Matches a whole style element, including its content. */
	private static final Pattern STYLE_ELEMENT =
			Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

	/** Matches any single HTML tag. */
	private static final Pattern ANY_TAG =
			Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

	private static final String path="F://three//";
	private static final String domainName="http://www.58xs.com";

	/**
	 * Walks the chapter chain starting at the hard-coded first chapter.
	 *
	 * <p>Each chapter is requested once; if no next-chapter URL comes back the same
	 * request is retried once, and a second failure is printed as an error. After a
	 * failure the loop keeps the current URL, so the next iteration tries it again.
	 *
	 * @param args unused
	 * @throws Exception if the sleep between requests is interrupted
	 */
	public static void main(String[] args) throws Exception {
		String s = domainName + "/html/122/122467/6226722.html";

		Plzl p = new Plzl();
		// Note: the first chapter is requested inside the loop only. Requesting it
		// once before the loop as well would append it to the output file twice.
		for (int i = 0; i < MAX_CHAPTERS; i++) {
			System.out.println("info:" + s);
			String u = p.requestByGetMethod(s);
			if ("".equals(u)) {
				// No next-chapter URL: retry the same page once.
				u = p.requestByGetMethod(s);
				if ("".equals(u)) {
					// Still failing: report it.
					System.out.println("error:" + s);
				}
			}
			if (!"".equals(u)) {
				s = u.trim();
			}
			Thread.sleep(REQUEST_INTERVAL_MILLIS);
		}
	}

	/**
	 * Fetches {@code url} with an HTTP GET, stores the chapter found in the response
	 * and returns the URL of the next chapter.
	 *
	 * <p>Any failure (network error, unexpected page layout, file error) is printed
	 * and reported to the caller as an empty string; nothing is thrown.
	 *
	 * @param url absolute URL of the chapter page
	 * @return absolute URL of the next chapter, or {@code ""} if it could not be
	 *         determined
	 */
	public String requestByGetMethod(String url) {
		String u = "";
		// try-with-resources closes the response first, then the client, even when
		// executing the request or parsing the page throws.
		try (CloseableHttpClient httpClient = HttpClients.createDefault();
				CloseableHttpResponse httpResponse = httpClient.execute(new HttpGet(url))) {
			HttpEntity entity = httpResponse.getEntity();
			if (null != entity) {
				u = read(EntityUtils.toString(entity, "utf-8"));
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return u;
	}

	/**
	 * Extracts title, body and next-chapter link from a chapter page and appends the
	 * title and body to the output file.
	 *
	 * <p>The page is parsed completely before the file is opened, so a page that does
	 * not match the expected layout leaves the output file untouched and a retry of
	 * the same page cannot produce partial or duplicated output.
	 *
	 * @param s HTML of the chapter page
	 * @return absolute URL of the next chapter, or {@code ""} when the page contains
	 *         no "content" marker
	 * @throws IOException if an expected marker is missing or the file cannot be
	 *         written
	 */
	private String read(String s) throws IOException {
		if (s == null || s.lastIndexOf("content") <= 0) {
			return "";
		}

		// Chapter title: text between the first <h1> and the following </h1>.
		int titleStart = s.indexOf("<h1>");
		int titleEnd = titleStart < 0 ? -1 : s.indexOf("</h1>", titleStart + 4);
		if (titleEnd < 0) {
			throw new IOException("chapter title (<h1>...</h1>) not found");
		}
		String title = s.substring(titleStart + 4, titleEnd);

		// Everything of interest lies between the first and the last "next chapter"
		// label of the page.
		int firstNext = s.indexOf("下一章");
		int lastNext = s.lastIndexOf("下一章");
		if (firstNext < 0 || firstNext == lastNext) {
			throw new IOException("expected two next-chapter labels on the page");
		}
		String section = s.substring(firstNext, lastNext);

		String nextPath = extractNextChapterPath(section);

		// Body: starts 9 characters after the last "content" marker of the section
		// (skipping the marker and the rest of its opening tag) and ends at the
		// "previous chapter" label that follows.
		int contentAt = section.lastIndexOf("content");
		if (contentAt < 0) {
			throw new IOException("content marker not found between next-chapter labels");
		}
		int bodyStart = contentAt + 9;
		int bodyEnd = section.indexOf("上一章", bodyStart);
		if (bodyEnd < 0) {
			throw new IOException("previous-chapter label not found after content");
		}
		String body = delHTMLTag(section.substring(bodyStart, bodyEnd));

		String txt = path + "//pmzlx.txt";
		// Append mode; both streams are closed (and thereby flushed) automatically.
		try (OutputStream os = new FileOutputStream(txt, true);
				OutputStreamWriter osw = new OutputStreamWriter(os, "UTF-8")) {
			osw.write(title + "\r\n");
			// Runs of four spaces separate paragraphs in the page; turn them into
			// line breaks.
			osw.write(body.replace("    ", "\r\n") + "\r\n");
		}
		return domainName + nextPath;
	}

	/**
	 * Returns the site-relative path of the next chapter from the text that directly
	 * precedes the last "next chapter" label.
	 *
	 * <p>When that text ends with a quoted attribute value followed by {@code >}, the
	 * value between the quotes is returned, whatever its length. Otherwise the
	 * original fixed-width slice (29 characters before the last two) is used.
	 *
	 * @param section page text ending right before the last "next chapter" label
	 * @return the path of the next chapter, never empty
	 * @throws IOException if no path can be extracted
	 */
	private static String extractNextChapterPath(String section) throws IOException {
		int len = section.length();
		if (len >= 3 && section.charAt(len - 1) == '>') {
			char quote = section.charAt(len - 2);
			if (quote == '"' || quote == '\'') {
				int open = section.lastIndexOf(quote, len - 3);
				if (open >= 0) {
					if (open + 1 >= len - 2) {
						throw new IOException("next-chapter link is empty");
					}
					return section.substring(open + 1, len - 2);
				}
			}
		}
		if (len >= LEGACY_TAIL_LENGTH) {
			return section.substring(len - LEGACY_TAIL_LENGTH, len - 2);
		}
		throw new IOException("next-chapter link not found");
	}

	/**
	 * Removes script elements, style elements and all remaining HTML tags from the
	 * given markup.
	 *
	 * @param htmlStr HTML fragment; {@code null} is treated as empty
	 * @return the remaining text with leading and trailing whitespace trimmed
	 */
	public String delHTMLTag(String htmlStr) {
		if (htmlStr == null) {
			return "";
		}
		// Order matters: script and style elements go first so their content is
		// removed together with the tags.
		String text = SCRIPT_ELEMENT.matcher(htmlStr).replaceAll("");
		text = STYLE_ELEMENT.matcher(text).replaceAll("");
		text = ANY_TAG.matcher(text).replaceAll("");
		return text.trim();
	}

}











猜你喜欢

转载自blog.csdn.net/wangzhi291/article/details/72782506