JAVA HTML parsing process

Copyright: free to reuse; when reposting, please credit the source: https://blog.csdn.net/qq_32662595/article/details/88189956
  1. Upload the compressed file to the server
	/**
	 * Saves each non-empty uploaded file under {@code path} and then records it.
	 *
	 * <p>The client-supplied file name is reduced to its last path segment before it is used,
	 * so a name such as {@code ../../x} or {@code C:\dir\x} cannot place the file outside
	 * {@code path}. A file whose name is missing or unusable, or that cannot be written, is
	 * logged and skipped; it is not recorded.
	 *
	 * <p>NOTE(review): {@code fv} and {@code allFiles} are not declared in this excerpt; they
	 * are presumably built in the elided "upload record" section — confirm.
	 *
	 * @param session current HTTP session (not used by the visible code)
	 * @param files   uploaded multipart files
	 * @param path    directory the files are saved into
	 * @param user    uploading user (not used by the visible code)
	 * @return {@code allFiles}, the collected information about the uploaded files
	 */
	public List<FileVo> uploadFile(HttpSession session, MultipartFile[] files, String path, String user) {

		for (MultipartFile file : files) {
			if (file == null || file.isEmpty()) {
				continue;
			}
			String originalName = file.getOriginalFilename();
			if (originalName == null) {
				Logger.error("文件上传失败!");
				continue;
			}
			// Keep only the last path segment; handle both separators because the name comes
			// from the client and may use either convention.
			int lastSeparator = Math.max(originalName.lastIndexOf('/'), originalName.lastIndexOf('\\'));
			String safeName = originalName.substring(lastSeparator + 1);
			if (safeName.isEmpty() || ".".equals(safeName) || "..".equals(safeName)) {
				Logger.error("文件上传失败!");
				continue;
			}
			File savedFile = new File(path, safeName);
			try {
				FileUtils.copyInputStreamToFile(file.getInputStream(), savedFile);
			} catch (IOException e) {
				Logger.error("文件上传失败!");
				// Nothing was saved, so there is nothing to record for this file.
				continue;
			}
			try {
				Fileupload upLoadFile = new Fileupload();
				BeanUtils.copyProperties(fv, upLoadFile);
				// ....... an upload record can be added here
			} catch (Exception e) {
				// If handling the current file fails, carry on with the next file.
				// A record of the failed file can be added here.
				continue;
			}
		}
		return allFiles;
	}
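The method returns information about the uploaded files; the important parts are the path each file was saved to and its file name.
  1. Decompress the files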

    There are three forms of compressed files: JAR / ZIP / RAR

/**
 * Extracts an uploaded archive by dispatching to the extractor for its type.
 *
 * <p>The archive is read from {@code url + fileName} and extracted into {@code url}.
 * A {@code null} or blank {@code fileType} is ignored and nothing is extracted.
 *
 * @param url      directory that contains the archive and receives the extracted files;
 *                 it is concatenated directly with {@code fileName}
 * @param fileName name of the archive file
 * @param fileType archive type; compared case-insensitively with the
 *                 {@code MYJZConfig.UPLOADDECOMPRESSTYPE_*} constants
 * @param encode   character set name passed to the ZIP extractor
 * @throws CustomException if {@code fileType} is not blank and is not one of the supported types
 */
public void decompressFile(String url, String fileName, String fileType, String encode) {
		// Test the raw value: upper-casing first would throw on a null fileType.
		if (StringUtils.isNotBlank(fileType)) {
			// Locale.ROOT keeps the case mapping independent of the JVM's default locale.
			switch (fileType.toUpperCase(java.util.Locale.ROOT)) {
			case MYJZConfig.UPLOADDECOMPRESSTYPE_JAR:
				ZipOrJarFileAction.getJarFileDecompression(url + fileName, url);
				break;

			case MYJZConfig.UPLOADDECOMPRESSTYPE_ZIP:
				ZipOrJarFileAction.getDecompressionFile(url + fileName, url, encode);
				break;
			case MYJZConfig.UPLOADDECOMPRESSTYPE_RAR:
				ZipOrJarFileAction.unrar(url + fileName, url);
				break;
			default:
				throw new CustomException("目前支持JAR、ZIP、RAR,压缩文件类型不支持!");
			}
		}

	}
Extracting a JAR:
/**
 * Extracts every file entry of a JAR into a target directory.
 *
 * <p>The target directory is created when it does not exist yet. Directory entries are
 * skipped; the parent directory of each file entry is created on demand. An entry whose
 * resolved path would fall outside the target directory aborts the extraction.
 * I/O failures are logged and not rethrown.
 *
 * @param jarFilePath          path of the JAR file to extract
 * @param decompressionJarPath directory that receives the extracted files
 * @return {@code decompressionJarPath}, whether or not the extraction succeeded
 */
public static String getJarFileDecompression(String jarFilePath, String decompressionJarPath) {
		File saveDir = new File(decompressionJarPath);
		if (!saveDir.isDirectory()) {
			saveDir.mkdirs();
		}

		try (JarFile jarFile = new JarFile(jarFilePath)) {
			String rootPath = saveDir.getCanonicalPath();
			String rootPrefix = rootPath.endsWith(File.separator) ? rootPath : rootPath + File.separator;

			Enumeration<JarEntry> jarItems = jarFile.entries();
			while (jarItems.hasMoreElements()) {
				JarEntry jarItem = jarItems.nextElement();
				if (jarItem.isDirectory()) {
					continue;
				}
				// Resolve against the directory object so a missing trailing separator in
				// decompressionJarPath cannot glue the entry name onto the directory name.
				File newFile = new File(saveDir, jarItem.getName());
				// Reject names such as "../../x" that would escape the target directory.
				if (!newFile.getCanonicalPath().startsWith(rootPrefix)) {
					throw new IOException("JAR entry escapes the target directory: " + jarItem.getName());
				}
				File parent = newFile.getParentFile();
				if (parent != null) {
					parent.mkdirs();
				}
				// Per-entry resources: both streams are closed before the next entry is read.
				try (InputStream is = jarFile.getInputStream(jarItem);
						FileOutputStream fos = new FileOutputStream(newFile)) {
					byte[] b = new byte[1024];
					int len;
					while ((len = is.read(b)) != -1) {
						fos.write(b, 0, len);
					}
				}
			}
		} catch (IOException e) {
			logger.error("JAR文件解压失败 !", e);
		}
		return decompressionJarPath;
	}

	/**
	 * Closes the given resources quietly, skipping any argument that is {@code null}.
	 *
	 * <p>The resources are closed in this fixed order: {@code fos}, {@code bis}, {@code zos},
	 * {@code is}, {@code zf}, {@code jarFile}. Each argument is closed once. An
	 * {@link IOException} raised while closing one resource is ignored so that the remaining
	 * resources are still closed.
	 *
	 * <p>Streams are generally closed in the reverse order of their creation, output side
	 * first. For example, given
	 * {@code fis -> bis = new BufferedInputStream(fis)} and
	 * {@code fos -> bos = new BufferedOutputStream(fos)}, the order is
	 * {@code bos, fos, bis, fis}. When a different order is needed than the fixed one used
	 * here, call the method several times and pass {@code null} for the unused slots, e.g.
	 * {@code closeAll(fos, null, null, null, null, null)} followed by
	 * {@code closeAll(null, null, null, null, bis, null)}.
	 *
	 * @param fos     file output stream, or {@code null}
	 * @param is      input stream, or {@code null}
	 * @param zf      ZIP file, or {@code null}
	 * @param zos     ZIP output stream, or {@code null}
	 * @param bis     buffered input stream, or {@code null}
	 * @param jarFile JAR file, or {@code null}
	 */
	private static void closeAll(FileOutputStream fos, InputStream is, ZipFile zf, ZipOutputStream zos,
			BufferedInputStream bis, JarFile jarFile) {
		java.io.Closeable[] inCloseOrder = { fos, bis, zos, is, zf, jarFile };
		for (java.io.Closeable resource : inCloseOrder) {
			if (resource == null) {
				continue;
			}
			try {
				resource.close();
			} catch (IOException ignored) {
				// Best-effort cleanup: a failed close must not prevent closing the rest.
			}
		}
	}

Extracting a ZIP:

/**
 * Extracts a ZIP archive into a target directory.
 *
 * <p>Directory entries are created as directories; for file entries the parent directory is
 * created on demand and the content is copied. An entry whose resolved path would fall
 * outside the target directory aborts the extraction. I/O failures are logged and not
 * rethrown.
 *
 * @param zipFilePath       path of the ZIP file to extract
 * @param deCompressionPath directory that receives the extracted files
 * @param readEncoding      name of the character set used to decode entry names; UTF-8 is
 *                          used when this is {@code null} or blank
 * @return {@code deCompressionPath} concatenated with {@code ZipFile.getName()} of the opened
 *         archive, or {@code null} when the archive could not be opened
 */
public static String getDecompressionFile(String zipFilePath, String deCompressionPath, String readEncoding) {
		String zipPath = null;
		Charset entryNameCharset = (readEncoding == null || readEncoding.trim().isEmpty())
				? java.nio.charset.StandardCharsets.UTF_8
				: Charset.forName(readEncoding);
		// Open the ZIP file with the requested entry-name encoding; closed automatically.
		try (ZipFile zipFileItems = new ZipFile(zipFilePath, entryNameCharset)) {
			// NOTE(review): ZipFile.getName() is the path name of the archive, so this is the
			// target directory followed by the archive path. Kept as-is for callers; confirm
			// whether the extracted folder name was intended instead.
			zipPath = deCompressionPath + zipFileItems.getName();

			File targetRoot = new File(deCompressionPath);
			String rootPath = targetRoot.getCanonicalPath();
			String rootPrefix = rootPath.endsWith(File.separator) ? rootPath : rootPath + File.separator;

			Enumeration<? extends ZipEntry> items = zipFileItems.entries();
			while (items.hasMoreElements()) {
				// The file or directory entry currently being processed.
				ZipEntry zipCurrentItem = items.nextElement();
				File newFile = new File(targetRoot, zipCurrentItem.getName());

				// Reject names such as "../../x" that would escape the target directory.
				String resolved = newFile.getCanonicalPath();
				if (!resolved.equals(rootPath) && !resolved.startsWith(rootPrefix)) {
					throw new IOException("ZIP entry escapes the target directory: " + zipCurrentItem.getName());
				}

				if (zipCurrentItem.isDirectory()) {
					newFile.mkdirs();
					continue;
				}

				// Archives do not always contain explicit directory entries, so make sure
				// the file's own parent directory exists, not just the extraction root.
				File parent = newFile.getParentFile();
				if (parent != null) {
					parent.mkdirs();
				}
				// Per-entry resources: both streams are closed before the next entry is read.
				try (InputStream is = zipFileItems.getInputStream(zipCurrentItem);
						FileOutputStream fos = new FileOutputStream(newFile)) {
					byte[] b = new byte[1024];
					int len;
					while ((len = is.read(b)) != -1) {
						fos.write(b, 0, len);
					}
				}
			}
		} catch (FileNotFoundException e) {
			logger.error("你输入的文件路径有误,请检查文件路径及其后缀是否正确!", e);
		} catch (IOException e) {
			logger.error("文件流异常断开!", e);
		}
		return zipPath;
	}

Extract the rar:

/**
 * Extracts a RAR archive into a target directory.
 *
 * <p>Directory headers are created as directories; for file headers the parent directory is
 * created on demand and the content is extracted into the file. A header whose resolved
 * path would fall outside the target directory aborts the extraction.
 *
 * @param orgPath path of the RAR file to extract
 * @param disPath directory that receives the extracted files
 * @throws CustomException if anything goes wrong while opening or extracting the archive;
 *                         the original exception is attached as a suppressed exception
 */
public static void unrar(String orgPath, String disPath) {
		Archive archive = null;
		try {
			archive = new Archive(new NativeStorage(new File(orgPath)));
			if (archive.isEncrypted()) {
				// NOTE(review): encrypted archives get no special handling here; extraction
				// is still attempted. Confirm whether they should be rejected up front.
			}

			File targetRoot = new File(disPath);
			String rootPath = targetRoot.getCanonicalPath();
			String rootPrefix = rootPath.endsWith(File.separator) ? rootPath : rootPath + File.separator;

			for (FileHeader fh = archive.nextFileHeader(); fh != null; fh = archive.nextFileHeader()) {
				String compressFileName = fh.getFileNameString().trim();
				File destFileName = new File(targetRoot, compressFileName);

				// Reject names such as "../../x" that would escape the target directory.
				String resolved = destFileName.getCanonicalPath();
				if (!resolved.equals(rootPath) && !resolved.startsWith(rootPrefix)) {
					throw new IOException("RAR entry escapes the target directory: " + compressFileName);
				}

				if (fh.isDirectory()) {
					destFileName.mkdirs();
					continue;
				}

				File parent = destFileName.getParentFile();
				if (parent != null && !parent.exists()) {
					parent.mkdirs();
				}
				// The output stream is closed as soon as this entry has been written.
				try (FileOutputStream fos = new FileOutputStream(destFileName)) {
					archive.extractFile(fh, fos);
				}
			}

			archive.close();
			archive = null;
		} catch (Exception e) {
			CustomException failure = new CustomException("RAR 文件解压异常!");
			// Only the single-message constructor is visible here, so keep the root cause
			// reachable for diagnostics via the suppressed list.
			failure.addSuppressed(e);
			throw failure;
		} finally {
			// Reached with a non-null archive only when extraction failed part-way.
			if (archive != null) {
				try {
					archive.close();
				} catch (Exception ignored) {
					// Best-effort cleanup; the extraction failure is already being reported.
				}
			}
		}
	}

  1. Parsing the file contents
/**
 * Parses one HTML page into basic information, attendance rows and document rows.
 *
 * <p>Basic information is read from the {@code <p>} elements inside every
 * {@code <section>}: the texts of the child elements are collected in document order and
 * then paired up as label/value. A {@code <p>} with fewer than two child nodes whose text is
 * "Photo" (case-insensitive) stores the {@code src} of the section's {@code <img>} under the
 * key {@code "photo"}; when there are several images the last one wins.
 *
 * <p>The two tables are only read when the page contains exactly two {@code <table>}
 * elements: the first yields the attendance rows, the second the document rows.
 *
 * @param path path of the HTML file; passed to {@code readHtml}
 * @return the parsed data; the row lists are empty when the page does not have exactly
 *         two tables
 */
public static DecompressVo parseHtml(String path) {
		// Page content
		Document doc = Jsoup.parse(readHtml(path));
		Map<String, String> dataMap = new HashMap<>();
		// Label and value texts in document order; index-accessed below, hence ArrayList.
		List<String> baseInfo = new ArrayList<>();
		Elements peronalAndOffices = doc.getElementsByTag("section");

		if (peronalAndOffices != null) {
			for (Element peronalAndOffice : peronalAndOffices) {
				for (Element ptag : peronalAndOffice.getElementsByTag("p")) {
					if (!ptag.hasText()) {
						continue;
					}
					if (ptag.childNodes().size() < 2) {
						if ("Photo".equalsIgnoreCase(ptag.text())) {
							// Record the image path exactly as written in the page; no
							// encoding of the image itself happens here.
							for (Element pic : peronalAndOffice.getElementsByTag("img")) {
								dataMap.put("photo", pic.attr("src"));
							}
						}
					} else {
						for (Element personal : ptag.children()) {
							baseInfo.add(personal.text());
						}
					}
				}
			}
		}

		// Pair the texts up as label/value. The bound stops before a trailing label that
		// has no value, which previously caused an IndexOutOfBoundsException.
		for (int i = 0; i + 1 < baseInfo.size(); i += 2) {
			String label = baseInfo.get(i);
			if (StringUtils.isNotBlank(label)) {
				dataMap.put(label, baseInfo.get(i + 1));
			}
		}

		// Records
		List<List<String>> attendList = new ArrayList<>();
		List<List<String>> docList = new ArrayList<>();
		Elements attendAndDocment = doc.getElementsByTag("table");
		if (null != attendAndDocment && attendAndDocment.size() == 2) {
			attendList = readTableRows(attendAndDocment.get(0));
			// Document information
			docList = readTableRows(attendAndDocment.get(1));
		}

		DecompressVo decompressVo = new DecompressVo();
		decompressVo.setBaseInfo(dataMap);
		decompressVo.setAttends(attendList);
		decompressVo.setDocs(docList);
		return decompressVo;
	}

	/** Returns the {@code <td>} texts of every {@code <tr>} in the table that has more than one cell. */
	private static List<List<String>> readTableRows(Element table) {
		List<List<String>> rows = new ArrayList<>();
		for (Element tr : table.getElementsByTag("tr")) {
			Elements cells = tr.getElementsByTag("td");
			if (null == cells || cells.isEmpty()) {
				continue;
			}
			List<String> texts = new ArrayList<>();
			for (Element td : cells) {
				texts.add(td.text());
			}
			if (texts.size() > 1) {
				rows.add(texts);
			}
		}
		return rows;
	}
  1. Store the parsed data in the database
/**
 * Parses every matching HTML file under {@code url} and stores the extracted data.
 *
 * <p>For each file the person record is inserted first; if that insert throws, the rest of
 * the file is skipped. The attendance rows and the document rows are then inserted with the
 * person's id. Before each file is processed, progress is written to the session attribute
 * {@code sessionAttr} as a whole-number percentage of the files started so far.
 *
 * <p>Table rows with fewer than four cells do not abort the run: missing cells are passed
 * to the setters as {@code null}.
 *
 * @param session     session that receives the progress value
 * @param sessionAttr name of the session attribute holding the progress
 * @param url         root directory that is searched for files
 * @param suffix      file-name suffix of the files to parse
 */
public void analysis(HttpSession session, String sessionAttr, String url, String suffix) {
		// Walk the directory tree and collect the HTML files in it.
		List<String> paths = TraversalFile.getFileUrlByTraversal(url, suffix);
		if (CollectionUtils.isEmpty(paths)) {
			return;
		}

		// double so that the progress division below is not an integer division
		double total = paths.size();
		int count = 0;
		for (String path : paths) {
			// Progress as a percentage
			count++;
			session.setAttribute(sessionAttr, (int) ((count / total) * 100));
			DecompressVo datas = HtmlParser.parseHtml(path);
			/*
			 * Example of the basic information of one page:
			 *
			 * {Email=NA, Registered by=admin,
			 * photo=../upload_teaching/f54cb43e0212b28e982891c40642c5ca,
			 * Gender=Female, Name=tenzin dolma, Parent name=dharpo, Home
			 * address=NA, Local address=TIPA BOY HOSTEL, TIPA.. MCLEOD
			 * GANJ, DHARAMSHALA - 1762 19, HP, INDIA, SNO=10 DS,
			 * Remarks=contact no. 9882270774, Phone=NA, Region=Dhasa,
			 * Country=Tibet, Registered date=2016-Dec-13 12:09}
			 *
			 * Saved to the database in three steps: 1. basic information,
			 * 2. meeting (attendance) information, 3. document information.
			 */

			// Resolve the photo location: the page references it relative to its own
			// directory ("../..."), so it is rebuilt from the grandparent of the HTML file.
			String photoMd5 = null;
			String photo = datas.getBaseInfo().get("photo");
			if (StringUtils.isNotBlank(photo)) {
				File file = new File(path);
				String dir = file.getParentFile().getParent();
				photo = photo.replace("..", "");
				photo = dir + photo;
				photoMd5 = MD5Utils.encryptToBase64(photo);
			}

			OdmsPeronalOfficealWithBLOBs odOfficeal = new OdmsPeronalOfficealWithBLOBs();
			odOfficeal.setId(UUIDUtils.getUUID() + "_" + datas.getBaseInfo().get("Name"));
			odOfficeal.setEmail(datas.getBaseInfo().get("Email"));
			odOfficeal.setRegisteredby(datas.getBaseInfo().get("Registered by"));
			odOfficeal.setGender(datas.getBaseInfo().get("Gender"));
			odOfficeal.setName(datas.getBaseInfo().get("Name"));
			odOfficeal.setHomeaddress(datas.getBaseInfo().get("Home address"));
			odOfficeal.setLocaladdress(datas.getBaseInfo().get("Local address"));
			odOfficeal.setSno(datas.getBaseInfo().get("SNO"));
			odOfficeal.setRemarks(datas.getBaseInfo().get("Remarks"));
			odOfficeal.setPhone(datas.getBaseInfo().get("Phone"));
			odOfficeal.setRegion(datas.getBaseInfo().get("Region"));
			odOfficeal.setCountry(datas.getBaseInfo().get("Country"));
			odOfficeal.setRegistereddate(datas.getBaseInfo().get("Registered date"));
			odOfficeal.setParentname(datas.getBaseInfo().get("Parent name"));
			odOfficeal.setPhoto(photo);
			odOfficeal.setPhotomd5(photoMd5);
			try {
				peronalOfficealMapper.insertSelective(odOfficeal);
			} catch (Exception e) {
				// Without the person record the dependent rows have nothing to refer to.
				// NOTE(review): the failure is dropped silently; consider logging it.
				continue;
			}

			// Meetings (teaching attendance)
			List<List<String>> attends = datas.getAttends();
			if (!CollectionUtils.isEmpty(attends)) {
				for (List<String> list : attends) {
					OdmsTeachingattendance attend = new OdmsTeachingattendance();
					attend.setPersonalid(odOfficeal.getId());
					attend.setTeachingname(cellAt(list, 0));
					attend.setCategory(cellAt(list, 1));
					attend.setAddedby(cellAt(list, 2));
					attend.setRdata(cellAt(list, 3));
					attendMapper.insertSelective(attend);
				}
			}

			// Documents
			List<List<String>> docs = datas.getDocs();
			if (!CollectionUtils.isEmpty(docs)) {
				for (List<String> doc : docs) {
					OdmsDocuments data = new OdmsDocuments();

					data.setPersonalid(odOfficeal.getId());
					data.setRdate(cellAt(doc, 0));
					data.setCategory(cellAt(doc, 2));
					data.setEnteredby(cellAt(doc, 1));
					data.setNumber(cellAt(doc, 3));
					documentMapper.insertSelective(data);
				}
			}
		}
	}

	/** Returns the cell at {@code index}, or {@code null} when the row is shorter than that. */
	private static String cellAt(List<String> row, int index) {
		return index < row.size() ? row.get(index) : null;
	}
  1. Get the paths of all files of the specified type
/**
 * Collects the paths of the files below a directory, optionally filtered by suffix, and
 * logs the resulting list at info level.
 *
 * @param dir    directory to start from
 * @param suffix file-name suffix to match; when blank, every file is included
 * @return the collected file paths
 */
public static List<String> getFileUrlByTraversal(String dir, String suffix) {
		List<String> collected = getAllFilePaths(new File(dir), new ArrayList<String>(), suffix);
		logger.info(collected.toString());
		return collected;
	}

	/**
	 * Recursively appends the paths of the files below {@code filePath} to {@code filePaths}.
	 *
	 * <p>Sub-directories are descended into. A file is added when {@code suffix} is blank or
	 * its name ends with {@code suffix}. Nothing is added when {@code listFiles()} returns
	 * {@code null} for {@code filePath}.
	 *
	 * @param filePath  directory to list
	 * @param filePaths accumulator that receives the paths; also the return value
	 * @param suffix    file-name suffix to match, or blank to accept every file
	 * @return {@code filePaths}
	 */
	private static List<String> getAllFilePaths(File filePath, List<String> filePaths, String suffix) {
		File[] children = filePath.listFiles();
		if (children != null) {
			for (File child : children) {
				if (child.isDirectory()) {
					getAllFilePaths(child, filePaths, suffix);
					continue;
				}
				boolean accepted = !StringUtils.isNotBlank(suffix) || child.getName().endsWith(suffix);
				if (accepted) {
					filePaths.add(child.getPath());
				}
			}
		}
		return filePaths;
	}

The key part of this walkthrough is the use of jsoup.
With that, parsing is complete.

Guess you like

Origin blog.csdn.net/qq_32662595/article/details/88189956