原生Java使用Socket制作爬虫重复造轮子系列-一步步带-2

Hello，大家好，我是Shendi，这次给大家带来爬虫系列2，爬取网页图片

纯手码，如果有地方有点小错误请指出

这次将上次的代码改进了很多，我们自己写的Socket封装类：

/**
 * Connects to the given target and prepares the protocol object that sends the request.
 * <p>
 * {@code hostArg} may carry a path after the host name, for example
 * {@code www.example.com/project}. Everything from the first '/' onwards is stored
 * in {@code subitem}; only the host name itself is kept in {@code host}.
 * If the connection cannot be opened the error is logged and {@code close()} is called.
 *
 * @param reqType request method written into the request line (for example GET)
 * @param hostArg host name, optionally followed by a path
 * @param port    port to connect to
 * @param outTime read timeout handed to {@code Socket.setSoTimeout} (milliseconds)
 */
public HttpSocketUtils(String reqType,String hostArg, int port, int outTime) {
		try {
			this.host = hostArg;
			this.port = port;
			this.outTime = outTime;
			// Split an optional path off the host name.
			int index = host.indexOf('/');
			if (index != -1) {
				subitem = host.substring(index);
				// A single path segment such as "/project" names a root project and gets a
				// trailing '/'. A file ("/a.png"), a query ("/s?q=1") or a path that already
				// ends with '/' is sent unchanged; appending '/' there would request
				// "/a.png/" or "//" instead of the intended resource.
				boolean singleSegment = subitem.indexOf('/', 1) == -1;
				boolean looksLikeResource = subitem.indexOf('.') != -1 || subitem.indexOf('?') != -1;
				if (singleSegment && !looksLikeResource && !subitem.endsWith("/")) {
					subitem += '/';
				}
				this.host = host.substring(0, index);
			}
			// No path given: request the site root.
			if ("".equals(subitem)) {
				subitem = "/";
			}
			// Open the connection.
			socket = new Socket(host, port);
			socket.setSoTimeout(outTime);
			// Set up the protocol that writes the request and parses the response.
			protocol = new Protocol() {
				public HttpUtils openHttpConn() {
					try {
						// Write the request line, the Host header and the blank line ending the head.
						output = socket.getOutputStream();
						output.write((reqType+" " + subitem + " HTTP/1.1\r\n").getBytes());
						output.write(("HOST: "+ host +" \r\n").getBytes());
						output.write("\r\n".getBytes());
						// Parse the response.
						input = new InputStreamUtils(socket.getInputStream());
						HttpUtils http = new HttpUtils(input);
						return http;
					} catch (IOException e) {
						e.printStackTrace();
						WebLog.printErr(e.getMessage());
					}
					// Reached only when the request failed.
					return null;
				}
			};
		} catch (IOException e) {
			e.printStackTrace();
			WebLog.printErr(e.getMessage());
			close();
		}
	}

这里增加了一个是否为根项目的判断：如果是根项目，路径末尾需要加上“/”；不是根项目则不需要。

改进了HttpUtils：(1) 修复了数据头最后一个键值对不是Content-Length时会出错的问题；(2) 修复了没有Content-Length时获取完数据体后不会跳出循环的问题；(3) 数据体除了以字符串形式保存之外，另外用byte[]形式保存了一份。

/**
 * Parses the HTTP response available from {@code input}.
 * <p>
 * Reads the status line into {@code state} / {@code stateInfo}, collects the header
 * lines into {@code head}, then reads the body into {@code bodyByte} and {@code body}.
 * With a Content-Length header exactly that many bytes are read; without one the
 * stream is read until it ends or until the data ends with the final chunk marker
 * of a chunked response ("0\r\n\r\n").
 *
 * @throws IOException if {@code input} is null, the Content-Length value is not a
 *                     non-negative number, or the stream ends before the announced
 *                     body length was read
 */
private void Initialize() throws IOException {
		if (input == null) {
			throw new IOException("input is null");
		}
		StringBuffer dataHead = new StringBuffer();
		// The first line is the status line: "<version> <code> <reason phrase>".
		String data = input.readLine();
		if (data != null) {
			dataHead.append(data);
			// Limit 3 keeps a multi-word reason phrase ("Not Found") in one piece.
			String[] datas = data.split(" ", 3);
			if (datas.length > 1) {
				// trim() drops a trailing line break if readLine() keeps one
				// (the previous code cut two characters off unconditionally).
				state = datas[1].trim();
				if (datas.length > 2) {
					stateInfo = datas[2].trim();
				}
			}
		}
		int dataBodySize = -1;
		// Header lines; Content-Length may appear at any position.
		while ((data = input.readLine()) != null) {
			// Limit 2: values may contain ':' themselves, and an empty value still counts as a pair.
			String[] map = data.split(":", 2);
			if (map.length < 2) {
				// Not a key/value pair (the blank line after the headers): head is complete.
				break;
			}
			dataHead.append(data);
			String key = map[0].trim();
			String value = map[1].trim();
			// Header names are case-insensitive. Content-Type is not interpreted here.
			if ("Content-Length".equalsIgnoreCase(key)) {
				try {
					dataBodySize = Integer.parseInt(value);
				} catch (NumberFormatException e) {
					throw new IOException("invalid Content-Length: " + value, e);
				}
				if (dataBodySize < 0) {
					throw new IOException("invalid Content-Length: " + value);
				}
			}
		}
		// Drop the line break that ends the last collected line, if there is one.
		int headLength = dataHead.length();
		while (headLength > 0
				&& (dataHead.charAt(headLength - 1) == '\n' || dataHead.charAt(headLength - 1) == '\r')) {
			headLength--;
		}
		dataHead.setLength(headLength);
		if (dataHead.length() > 0) {
			head = dataHead.toString();
		}
		// Body: fixed length when announced, otherwise read until the end marker or end of stream.
		byte[] dataBody;
		if (dataBodySize != -1) {
			dataBody = readFixedLengthBody(dataBodySize);
		} else {
			dataBody = readBodyUntilEnd();
		}
		body = new String(dataBody);
		bodyByte = dataBody;
	}

	/** Reads exactly {@code length} body bytes from {@code input}. */
	private byte[] readFixedLengthBody(int length) throws IOException {
		byte[] dataBody = new byte[length];
		int size = 0;
		while (size < length) {
			// Never ask for more than what is still missing, so a read can not overrun dataBody.
			byte[] temp = new byte[Math.min(length - size, 8192)];
			int len = input.read(temp);
			if (len == -1) {
				throw new IOException("stream ended after " + size + " of " + length + " body bytes");
			}
			System.arraycopy(temp, 0, dataBody, size, len);
			size += len;
		}
		return dataBody;
	}

	/** Reads body bytes until the stream ends or the data ends with the final chunk marker. */
	private byte[] readBodyUntilEnd() throws IOException {
		byte[] buffer = new byte[4096];
		int count = 0;
		byte[] bytes = new byte[1024];
		int len;
		while ((len = input.read(bytes)) != -1) {
			if (count + len > buffer.length) {
				// Grow geometrically instead of re-copying the whole body for every chunk read.
				byte[] grown = new byte[Math.max(buffer.length * 2, count + len)];
				System.arraycopy(buffer, 0, grown, 0, count);
				buffer = grown;
			}
			System.arraycopy(bytes, 0, buffer, count, len);
			count += len;
			if (endsWithFinalChunk(buffer, count)) {
				break;
			}
		}
		byte[] dataBody = new byte[count];
		System.arraycopy(buffer, 0, dataBody, 0, count);
		return dataBody;
	}

	/**
	 * Tells whether the first {@code count} bytes of {@code data} end with "\r\n0\r\n\r\n",
	 * or consist of "0\r\n\r\n" only (a chunked body without any data).
	 */
	private static boolean endsWithFinalChunk(byte[] data, int count) {
		final byte[] marker = {'\r', '\n', '0', '\r', '\n', '\r', '\n'};
		// For the body-less case the leading "\r\n" of the marker is not part of the body.
		int skip = (count == 5) ? 2 : 0;
		int expected = marker.length - skip;
		if (count < expected) {
			return false;
		}
		for (int i = 0; i < expected; i++) {
			if (data[count - expected + i] != marker[skip + i]) {
				return false;
			}
		}
		return true;
	}

各个参数（字段）以及对应的get/set方法请自行添加。另外，这里对没有Content-Length数据头的情况，改为通过判断数据体的最后几个字节来确定数据是否读取完毕。

如果读到的数据以\r\n0\r\n\r\n结尾，则代表数据体已经读取完毕

以上的改好之后,就是我们扩展的时候了

我们需要获取图片,所以

新建一个HttpFilterDataUtils类内容如下

/**
 * Extracts pieces of information (image URLs, page title) from crawled HTML.
 * @author <a href='tencent://AddContact/?fromId=45&fromSubId=1&subcmd=all&uin=1711680493'>Shendi</a>
 */
public class HttpFilterDataUtils {
	
	/**
	 * Collects the {@code src} value of every {@code <img>} tag in the given HTML.
	 * <p>
	 * Tag and attribute names are matched case-insensitively. The value may be
	 * double-quoted, single-quoted or unquoted. Tags without a (non-empty)
	 * {@code src} are skipped, so the result never contains {@code null} entries.
	 *
	 * @param data HTML text; {@code null} is treated like an empty document
	 * @return the image URLs in document order; never {@code null}, possibly empty
	 */
	public static String[] getHttpImage(String data) {
		// Fully qualified so that this class needs no import lines.
		java.util.List<String> urls = new java.util.ArrayList<String>();
		if (data == null) {
			return new String[0];
		}
		int cursor = 0;
		while (cursor < data.length()) {
			int open = indexOfIgnoreCase(data, "<img", cursor);
			if (open == -1) {
				break;
			}
			int nameEnd = open + 4;
			// "<img" has to be the complete tag name, not the start of another one.
			if (nameEnd < data.length() && !isTagNameEnd(data.charAt(nameEnd))) {
				cursor = nameEnd;
				continue;
			}
			int close = data.indexOf('>', nameEnd);
			if (close == -1) {
				// Unterminated tag: nothing usable follows.
				break;
			}
			String src = extractSrc(data.substring(open, close + 1));
			if (src != null && !src.isEmpty()) {
				urls.add(src);
			}
			cursor = close + 1;
		}
		return urls.toArray(new String[0]);
	}
	
	/**
	 * Returns the text between the first {@code <title>} and the following {@code </title>}.
	 *
	 * @param data HTML text
	 * @return the title text, or {@code null} if {@code data} is null/empty or has no complete title element
	 */
	public static String getTitle(String data) {
		if (data == null || "".equals(data)) {
			return null;
		}
		int start = indexOfIgnoreCase(data, "<title>", 0);
		if (start == -1) {
			return null;
		}
		int contentStart = start + "<title>".length();
		int end = indexOfIgnoreCase(data, "</title>", contentStart);
		if (end == -1) {
			return null;
		}
		return data.substring(contentStart, end);
	}
	
	/** Reads the value of the {@code src} attribute out of a single tag, or returns null if there is none. */
	private static String extractSrc(String tag) {
		for (int at = 1; at + 3 < tag.length(); at++) {
			// The name must follow whitespace, so that e.g. "data-src" is not taken for "src".
			if (!Character.isWhitespace(tag.charAt(at - 1)) || !tag.regionMatches(true, at, "src", 0, 3)) {
				continue;
			}
			int pos = skipWhitespace(tag, at + 3);
			if (pos >= tag.length() || tag.charAt(pos) != '=') {
				continue;
			}
			pos = skipWhitespace(tag, pos + 1);
			if (pos >= tag.length()) {
				return null;
			}
			char quote = tag.charAt(pos);
			if (quote == '"' || quote == '\'') {
				// Quoted value: everything up to the matching quote, '=' and spaces included.
				int end = tag.indexOf(quote, pos + 1);
				return end == -1 ? null : tag.substring(pos + 1, end).trim();
			}
			// Unquoted value: runs up to the next whitespace or the end of the tag.
			int end = pos;
			while (end < tag.length() && !Character.isWhitespace(tag.charAt(end)) && tag.charAt(end) != '>') {
				end++;
			}
			return tag.substring(pos, end);
		}
		return null;
	}
	
	/** Case-insensitive {@code indexOf}; returns -1 if {@code needle} does not occur at or after {@code from}. */
	private static int indexOfIgnoreCase(String text, String needle, int from) {
		int last = text.length() - needle.length();
		for (int i = Math.max(from, 0); i <= last; i++) {
			if (text.regionMatches(true, i, needle, 0, needle.length())) {
				return i;
			}
		}
		return -1;
	}
	
	/** Returns the index of the first non-whitespace character at or after {@code from}. */
	private static int skipWhitespace(String text, int from) {
		int pos = from;
		while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
			pos++;
		}
		return pos;
	}
	
	/** Tells whether {@code c} may directly follow a tag name. */
	private static boolean isTagNameEnd(char c) {
		return Character.isWhitespace(c) || c == '/' || c == '>';
	}
	
}

此类用于从爬虫爬到的数据中筛选出需要的内容，这里我写了获取标题和获取图片路径的两个函数

然后新建一个HttpFileUtils类

/**
 * HTTP file helpers: download a resource body and store it on disk.
 * @author <a href='tencent://AddContact/?fromId=45&fromSubId=1&subcmd=all&uin=1711680493'>Shendi</a>
 */
public class HttpFileUtils {
	/**
	 * Downloads the resource behind {@code url} and returns the raw response body.
	 * <p>
	 * If {@code socket} is given it is used as it is and left open for the caller.
	 * Otherwise a new connection on port 80 is opened for this call and closed again
	 * before returning.
	 *
	 * @param socket connection to use, or {@code null} to open a new one
	 * @param host host name, for example www.baidu.com; only needed when {@code socket}
	 *             is {@code null} and {@code url} is relative
	 * @param url image location: absolute ("http://host/a.png", "//host/a.png",
	 *            "www.host.com/a.png") or relative to {@code host}
	 * @return the response body bytes
	 * @throws RuntimeException if {@code url} is null or empty
	 * @throws IllegalArgumentException if {@code url} is relative and no {@code host} is given
	 * @throws NullPointerException if no response could be obtained
	 */
	public static byte[] getImageByUrl(HttpSocketUtils socket,String host,String url) {
		// Validate arguments.
		if (url == null || "".equals(url)) {
			throw new RuntimeException("url is null by getImage");
		}
		// Only a connection opened here is closed here; a caller-supplied one stays open.
		boolean ownsSocket = socket == null;
		if (ownsSocket) {
			socket = new HttpSocketUtils(toTarget(host, url), 80);
		}
		try {
			// openHttpConn() returns null when the request failed; fail with a clear message.
			return java.util.Objects.requireNonNull(socket.openHttpConn(), "no HTTP response for " + url).getBodyByte();
		} finally {
			if (ownsSocket) {
				socket.close();
			}
		}
	}
	/**
	 * Writes image bytes to {@code savePath/fileName.type}, creating the directory if needed.
	 *
	 * @param image raw image bytes
	 * @param fileName file name without extension
	 * @param type image type used as extension, for example jpg; null or empty for no extension
	 * @param savePath directory to save into
	 * @throws IOException if the directory cannot be created or the file cannot be written
	 * @throws RuntimeException if {@code savePath} is null/empty or {@code image} is null
	 */
	public static void saveImageByPath(byte[] image,String fileName,String type,String savePath) throws IOException {
		// Validate arguments.
		if (savePath == null || "".equals(savePath)) {
			throw new RuntimeException("savePath is null by getImage");
		}
		if (image == null) {
			throw new RuntimeException("image is null by saveImage");
		}
		// A missing type means "no extension" rather than the literal text "null".
		String suffix = (type == null || "".equals(type)) ? "" : "." + type;
		File dir = new File(savePath);
		if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
			throw new IOException("cannot create directory: " + savePath);
		}
		// try-with-resources closes the stream even if the write fails.
		try (OutputStream output = new FileOutputStream(savePath+"/"+fileName+suffix)) {
			output.write(image);
		}
	}
	
	/**
	 * Builds the "host/path" string handed to HttpSocketUtils.
	 * A scheme prefix is removed because the connection is always made on port 80.
	 */
	private static String toTarget(String host, String url) {
		String target = url.trim();
		if (target.regionMatches(true, 0, "http://", 0, 7)) {
			return target.substring(7);
		}
		if (target.regionMatches(true, 0, "https://", 0, 8)) {
			return target.substring(8);
		}
		if (target.startsWith("//")) {
			// Protocol-relative URL: the host name follows directly.
			return target.substring(2);
		}
		if (target.startsWith("www.")) {
			return target;
		}
		if (host == null || "".equals(host)) {
			throw new IllegalArgumentException("host is required for relative url: " + url);
		}
		// Join with exactly one '/' between host and path.
		String base = host.endsWith("/") ? host.substring(0, host.length() - 1) : host;
		return target.startsWith("/") ? base + target : base + "/" + target;
	}
	
}

这里也是提供了两个函数,一个是从指定url中爬取图片,可以指定socket(后期扩展,比如有些网站需要cookie等),还有一个将图片保存到本地的方法

写完后我们就能够爬取图片了,测试代码

public class GetHttp {
	public static void main(String[] args) throws MalformedURLException, IOException {
		HttpSocketUtils socket = new HttpSocketUtils("www.png9.cn",80);
		HttpUtils http = socket.openHttpConn();
		if (http == null) {
			return;
		}
		String body = http.getBody();
		//获取数据
		String[] head = HttpFilterDataUtils.getHttpImage(body);
		int i = 0;
		for (String s : head) {
			System.out.println(s);
			byte[] image = HttpFileUtils.getImageByUrl(null,socket.getHost(),s);
			HttpFileUtils.saveImageByPath(image,""+i++,"jpg","F:/");
		}
		socket.close();
}

WebLog类是我的日志类,此外,还会有一些小问题让我们爬不到图片,比如网络权限等

如果对你有帮助的话请点个赞吧~ 关注我,了解更多好玩的有趣的...

Hack神帝

发布了38 篇原创文章 · 获赞 23 · 访问量 9065

私信关注

原生Java使用Socket制作爬虫重复造轮子系列-一步步带-2

猜你喜欢