昨天下午被叫去写一个爬虫,爬取某个网站的律师信息,emmmmm,所以就记录一下,方便以后使用
用的是比较老的方法了,作为初学者嘛!其实有专门做这个的HTML解析库Jsoup,效率应该更高.不过把这个弄懂了的话,感觉自己也能写一个类似的工具了,无非就是解析一下html,看哪个span或者哪个label对应取值.O(∩_∩)O哈哈~,当然还有效率问题.
// Multi-threaded crawler: walks 5900 list pages on credit.lawyerpass.com,
// follows each lawyer's detail page, scrapes a dozen profile fields with
// regexes, and appends one space-separated record per lawyer to a local file.
//
// NOTE(review): the original declared "package reptile;" and imported
// org.omg.CORBA.INTERNAL. The CORBA import was unused (and the class was
// removed from the JDK in Java 11, breaking compilation), and the package
// line prevented the "copy and run directly" usage the post advertises,
// so both were dropped.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Reptile {

    public static void main(String[] args) throws Exception {
        // Output file; every worker thread appends to it under a file lock.
        String fileName = "D:\\test\\lawyerInfo.txt";
        File file = new File(fileName);
        if (!file.exists()) {
            file.createNewFile();
        }
        // Matches the 32-character alphanumeric lawyer id inside a list-page link.
        String targetStr = "id=[0-9a-zA-Z]{32}";
        // Shared page counter; getAndIncrement() hands each task a unique page
        // number lock-free (CAS).
        AtomicInteger atoI = new AtomicInteger(1);
        // Fixed pool: at most 50 requests in flight at once.
        ExecutorService es = Executors.newFixedThreadPool(50);
        // BUG FIX: the original "while (taskNum <= 5900)" starting at 0 submitted
        // 5901 tasks for 5900 pages; "<" submits exactly one task per page.
        for (int taskNum = 0; taskNum < 5900; taskNum++) {
            es.submit(new Runnable() {
                @Override
                public void run() {
                    String url = "http://credit.lawyerpass.com/lawyer-list.jsp?q=&type=lawyer&x=110&y=18&page="
                            + atoI.getAndIncrement();
                    try {
                        List<String> result = getPage(url);
                        lawyerInfo(targetStr, result, file);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            });
        }
        // BUG FIX: without shutdown() the pool's non-daemon threads kept the
        // JVM alive forever after the crawl finished.
        es.shutdown();
    }

    /**
     * Appends {@code content} to {@code file} through a FileChannel guarded by
     * an OS file lock, so concurrent writers (threads or processes) do not
     * interleave partial records.
     */
    public static void writeByNIO(String content, File file) {
        // try-with-resources replaces the original manual finally block and
        // fixes the double close of both the channel and the file.
        try (RandomAccessFile fout = new RandomAccessFile(file, "rw");
                FileChannel fcout = fout.getChannel()) {
            fout.seek(fout.length()); // position the write pointer at EOF (append)
            FileLock flout = null;
            while (flout == null) {
                try {
                    // BUG FIX: tryLock() RETURNS NULL when another *process*
                    // holds the lock (it only throws OverlappingFileLockException
                    // for a lock held inside this JVM). The original broke out of
                    // the loop on null and then NPE'd on release().
                    flout = fcout.tryLock();
                } catch (Exception ignored) {
                    // Lock held by another thread in this JVM: fall through and retry.
                }
                if (flout == null) {
                    System.out.print("lock is exist ......");
                    try {
                        Thread.sleep(1000); // back off before the next attempt
                    } catch (InterruptedException ie) {
                        // BUG FIX: restore the interrupt flag instead of swallowing it.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
            try {
                // NOTE(review): getBytes() uses the platform default charset,
                // matching the original behavior — confirm GBK vs UTF-8 if the
                // output file looks garbled.
                fout.write(content.getBytes());
            } finally {
                flout.release();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
            System.out.print("file no find ...");
        }
    }

    /**
     * Scans the list-page lines for lawyer ids, fetches each detail page, and
     * appends one record (fields separated by single spaces, "暂无" for any
     * field the page does not contain) to {@code file}.
     */
    private static void lawyerInfo(String targetStr, List<String> result, File file)
            throws MalformedURLException, IOException {
        for (String str : result) {
            String regexString = RegexString(targetStr, str);
            if (regexString == null) {
                continue; // no lawyer id on this line
            }
            String uuid = regexString.substring(3); // drop the leading "id="
            String urlInfo = "http://credit.lawyerpass.com/lawyer.jsp?id=";
            // Collapse the page into one string with no separators — exactly the
            // shape the greedy ".*" patterns below were written against.
            // (StringBuilder replaces the original O(n^2) String "+=".)
            StringBuilder pageBuilder = new StringBuilder();
            for (String str1 : getPage(urlInfo + uuid)) {
                pageBuilder.append(str1);
            }
            String pageStr = pageBuilder.toString();

            // One regex per profile field; the handlers cut the value out from
            // between the HTML markers each pattern captures.
            String nameReg = "信用主体:<span>.*</span>信用编号";           // name
            String zhiyeReg = "执业证号:</label>.*</li>";                  // licence number
            String sexReg = "性别:</label>.*</li>";                        // sex
            String ageReg = "年龄:</label>.*</li>";                        // age
            String mingzuReg = "民族:</label>.*</li> ";                    // ethnicity (trailing space is intentional)
            String emailReg = "email:</label>.*</li>";                     // email
            String xueliReg = "学历:</label>.*</li>";                      // education
            String typeReg = "执业类型:</label>.*</li>";                   // practice type
            String politicalReg = "政治面貌:</label>.*<i class";           // political status
            String compentencyNumReg = "资格证号:</label>.*</li>";         // qualification number
            String lawyerInnerTypeReg = "所内身份:</label>.*</li>";        // role within the firm
            String companyReg = "主管司法局:</label>.*</li>";              // supervising justice bureau

            // BUG FIX: the original passed RegexString()'s possibly-null result
            // straight into the handlers — any page missing one field threw an
            // NPE and lost the whole record. The handlers are now null-safe and
            // return "", which orDefault() maps to "暂无".
            String name = orDefault(nameHandle(RegexString(nameReg, pageStr)));
            String zhiye = orDefault(handle(RegexString(zhiyeReg, pageStr)));
            String sex = orDefault(handle(RegexString(sexReg, pageStr)));
            String age = orDefault(handle(RegexString(ageReg, pageStr)));
            String email = orDefault(handle(RegexString(emailReg, pageStr)));
            String mingzu = orDefault(handle(RegexString(mingzuReg, pageStr)));
            String xueli = orDefault(handle(RegexString(xueliReg, pageStr)));
            String type = orDefault(handle(RegexString(typeReg, pageStr)));
            String political = orDefault(politicalHandle(RegexString(politicalReg, pageStr)));
            String compentencyNum = orDefault(handle(RegexString(compentencyNumReg, pageStr)));
            String lawyerInnerType = orDefault(handle(RegexString(lawyerInnerTypeReg, pageStr)));
            String company = orDefault(handle(RegexString(companyReg, pageStr)));

            StringBuilder sb = new StringBuilder();
            sb.append(name).append(" ").append(zhiye).append(" ").append(sex).append(" ").append(age).append(" ")
                    .append(mingzu).append(" ").append(xueli).append(" ").append(type).append(" ").append(political).append(" ")
                    .append(compentencyNum).append(" ").append(lawyerInnerType).append(" ").append(company).append(" ")
                    .append(email).append("\r\n");
            System.out.println(sb.toString());
            // Persist the record under the file lock.
            writeByNIO(sb.toString(), file);
        }
    }

    /** Maps an empty extraction result to the placeholder "暂无". */
    private static String orDefault(String value) {
        return "".equals(value) ? "暂无" : value;
    }

    /** Cuts the value between "</label>" and "<i class"; "" when absent/null. */
    private static String politicalHandle(String object) {
        if (object == null) {
            return "";
        }
        int begin = object.indexOf("</label>");
        int end = object.indexOf("<i class");
        if (begin < 0 || end < begin + 8) {
            return ""; // marker missing or malformed snippet
        }
        return object.substring(begin + 8, end);
    }

    /** Cuts the value between "</label>" and "</li>"; "" when absent/null. */
    private static String handle(String object) {
        if (object == null) {
            return "";
        }
        int begin = object.indexOf("</label>");
        int end = object.indexOf("</li>");
        if (begin < 0 || end < begin + 8) {
            return "";
        }
        return object.substring(begin + 8, end);
    }

    /** Cuts the value between "<span>" and "</span>"; "" when absent/null. */
    private static String nameHandle(String object) {
        if (object == null) {
            return "";
        }
        int begin = object.indexOf("<span>");
        int end = object.indexOf("</span>");
        if (begin < 0 || end < begin + 6) {
            return "";
        }
        return object.substring(begin + 6, end);
    }

    /**
     * Downloads {@code url} and returns the response body as a list of lines.
     * NOTE(review): the reader uses the platform default charset, matching the
     * original behavior — confirm the site's encoding if scraped text is garbled.
     */
    private static List<String> getPage(String url) throws MalformedURLException, IOException {
        URL realUrl = new URL(url);
        URLConnection connection = realUrl.openConnection();
        connection.connect();
        List<String> list = new ArrayList<String>();
        // BUG FIX: try-with-resources closes the reader; the original leaked one
        // open stream per request (and carried an unused "result" variable).
        try (BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                list.add(line);
            }
        }
        return list;
    }

    /**
     * Returns the first substring of {@code patternStr} matching the regex
     * {@code targetStr}, or {@code null} when there is no match.
     */
    static String RegexString(String targetStr, String patternStr) {
        Matcher matcher = Pattern.compile(targetStr).matcher(patternStr);
        if (matcher.find()) {
            return matcher.group(0);
        }
        return null;
    }
}
本来爬取的速度很慢,所以加入了多线程,运用了AtomicInteger的CAS操作在不需要加锁的情况下保证多线程下页码分配的安全性.写文件用的是NIO的FileChannel配合文件锁(FileLock),保证多个线程追加写入时记录互不交错(注意:这里对文件的写入本身仍是阻塞式的,并非真正的非阻塞IO).
还有值得改进的地方就是字符串匹配算法,现在用的是JDK的matcher和pattern.如果数据量更大的话可以修改JVM配置,提高JVM的性能.
这个爬虫复制下来可以直接爬取某个网站的律师数据→_→!!!!!
参考资料: 点击打开链接