项目准备
1.首先创建一个Maven项目,建立对应的resources文件夹:在src的main和test文件夹下分别创建resources文件夹,然后右键项目名->Build Path->Configure
build path->Add Folder->在resources前面打钩OK即可。
2.本项目使用crawler4j爬虫框架,并用到日志功能,所以需要在pom.xml中新增以下依赖:
<dependencies>
<dependency>
<groupId>edu.uci.ics</groupId>
<artifactId>crawler4j</artifactId>
<version>4.2</version>
</dependency>
<!-- 加入log4j支持 -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<!-- 加入slf4j log4j驱动类 -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.21</version>
</dependency>
</dependencies>
然后我们在src/main/resources下创建一个log4j.properties文件。(右键项目名,New->File,直接键入该名称即可)内容如下:
log4j.rootLogger = debug,D,E
### debug ###
log4j.appender.D = org.apache.log4j.DailyRollingFileAppender
log4j.appender.D.File = c://logs/log.log
log4j.appender.D.Append = true
log4j.appender.D.Threshold = DEBUG
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
### error ###
log4j.appender.E = org.apache.log4j.DailyRollingFileAppender
log4j.appender.E.File =c://logs/error.log
log4j.appender.E.Append = true
log4j.appender.E.Threshold = ERROR
log4j.appender.E.layout = org.apache.log4j.PatternLayout
log4j.appender.E.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
对应的log4j配置详解的链接为:https://blog.csdn.net/Dr_Guo/article/details/50718063
代码详解
首先创建一个ImageCrawlerControl类用来控制爬虫的网址和文件存储位置等。
代码如下:
package imageCrawler;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
/**
 * Entry point of the image crawler: builds the crawler4j configuration,
 * registers the seed URLs, hands the domain list and image folder to
 * {@link ImageCrawler}, and starts the crawl.
 */
public class ImageCrawlerControl {

    /**
     * Configures and launches the crawl.
     *
     * @param args command-line arguments (not used)
     * @throws Exception propagated from the crawler4j set-up calls
     */
    public static void main(String[] args) throws Exception {
        // Folder handed to crawler4j as its crawl storage folder.
        final String crawlStorageDir = "E:/Crawler/shuju";
        // Folder in which ImageCrawler stores the downloaded images.
        final String imageStorageDir = "E:/Crawler/tupian";
        // Crawler count passed to CrawlController.start.
        final int crawlerCount = 7;
        // Seed URLs; also used by ImageCrawler as the allowed URL prefixes.
        final String[] seedDomains = {"http://www.iii335.com/"};

        CrawlConfig crawlConfig = new CrawlConfig();
        crawlConfig.setCrawlStorageFolder(crawlStorageDir);
        // Binary content must be included, otherwise image bytes are not delivered.
        crawlConfig.setIncludeBinaryContentInCrawling(true);

        // Wire up fetcher, robots.txt handling and the controller.
        PageFetcher fetcher = new PageFetcher(crawlConfig);
        RobotstxtServer robotsServer = new RobotstxtServer(new RobotstxtConfig(), fetcher);
        CrawlController crawlController = new CrawlController(crawlConfig, fetcher, robotsServer);

        // Register every seed URL with the controller.
        for (int i = 0; i < seedDomains.length; i++) {
            crawlController.addSeed(seedDomains[i]);
        }

        // Tell the crawler class which domains to follow and where to store images.
        ImageCrawler.config(seedDomains, imageStorageDir);

        // Start crawling with ImageCrawler as the crawler class.
        crawlController.start(ImageCrawler.class, crawlerCount);
    }
}
然后再写一个ImageCrawler类,代码如下:
package imageCrawler;
import java.io.File;
import java.io.IOException;
import com.google.common.io.Files;
//必须引用该文件jar包,否则下方files.write会报错
import java.util.UUID;
import java.util.regex.Pattern;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.BinaryParseData;
import edu.uci.ics.crawler4j.url.WebURL;
/**
 * crawler4j crawler that follows links inside the configured domains and
 * saves every sufficiently large image it fetches into a local folder.
 *
 * <p>{@link #config(String[], String)} must be called before the crawl is
 * started so that the domain list and the storage folder are set.
 */
public class ImageCrawler extends WebCrawler {

    /** URL suffixes that are never followed (stylesheets, scripts, media, archives). */
    private static final Pattern filters = Pattern
            .compile(".*(\\.(css|js|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /** URL suffixes that identify the image formats to download. */
    private static final Pattern imgPatterns = Pattern.compile(".*(\\.(bmp|gif|jpe?g|png|tiff?))$");

    /** Images smaller than this many bytes (10 KiB) are skipped. */
    private static final int MIN_IMAGE_BYTES = 10 * 1024;

    /** Folder that receives the downloaded images; set by {@link #config}. */
    private static File Storagefile;

    /** URL prefixes the crawler is allowed to follow; set by {@link #config}. */
    private static String[] CrawlerDomains;

    /**
     * Sets the domains to crawl and the folder in which images are stored.
     * The folder (including missing parents) is created when it does not exist.
     *
     * @param domains         URL prefixes that may be followed
     * @param storagefilename path of the image storage folder
     */
    public static void config(String[] domains, String storagefilename) {
        CrawlerDomains = domains;
        Storagefile = new File(storagefilename);
        if (!Storagefile.exists()) {
            Storagefile.mkdirs();
        }
    }

    /**
     * Decides whether a discovered URL should be fetched. This is meant to
     * replace {@code WebCrawler.shouldVisit(Page, WebURL)}, so its name and
     * parameter types must match that method exactly; the original spelling
     * {@code shouldvisit} was never invoked by the framework.
     *
     * @param referringPage page on which the URL was found
     * @param url           candidate URL
     * @return {@code false} for filtered file types; {@code true} for image
     *         URLs and for URLs that start with one of the configured
     *         domains; {@code false} otherwise
     */
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Patterns are lower-case only, so compare against a lower-cased URL.
        String href = url.getURL().toLowerCase();
        if (filters.matcher(href).matches()) {
            return false;
        }
        if (imgPatterns.matcher(href).matches()) {
            return true;
        }
        if (CrawlerDomains == null) {
            return false;
        }
        for (String domain : CrawlerDomains) {
            // The original had a stray ';' after this condition, which made the
            // following return unconditional.
            if (domain != null && href.startsWith(domain.toLowerCase())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Legacy spelling kept so existing callers still compile.
     *
     * @deprecated use {@link #shouldVisit(Page, WebURL)}
     */
    @Deprecated
    public boolean shouldvisit(Page refferingpage, WebURL url) {
        return shouldVisit(refferingpage, url);
    }

    /**
     * Stores a fetched page when it is an image: the URL must end in one of
     * the image extensions, the parsed data must be binary, and the content
     * must be at least {@link #MIN_IMAGE_BYTES} bytes. The file is written
     * under a random UUID name that keeps the URL's extension.
     *
     * @param page fetched page
     */
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        // Match case-insensitively, consistent with shouldVisit, so that
        // e.g. ".JPG" images are not fetched and then discarded.
        boolean isImageUrl = imgPatterns.matcher(url.toLowerCase()).matches();
        if (!isImageUrl
                || !(page.getParseData() instanceof BinaryParseData)
                || page.getContentData().length < MIN_IMAGE_BYTES) {
            return;
        }
        // The pattern match above guarantees that the URL contains a '.'.
        String extension = url.substring(url.lastIndexOf('.'));
        String imageName = UUID.randomUUID() + extension;
        File target = new File(Storagefile.getAbsolutePath(), imageName);
        try {
            // Write the downloaded bytes to the target file.
            Files.write(page.getContentData(), target);
            System.out.println("爬取的url为:"+url);
        } catch (IOException exo) {
            exo.printStackTrace();
        }
    }
}
窒息的问题:
1.关于项目名上有一个黄色感叹号:
解决措施:右键项目名->Build path->configure build path->remove JRE System library->然后重新添加该库->workspace default jre(sqls)即可。
2.关于运行程序直接debug到fetchResult.discardContentIfNotConsumed();以及控制台显示:
Exception in thread "Crawler 2" java.lang.NoClassDefFoundError: com/google/common/io/Files
解决措施:下载guava的jar包然后添加到项目工程中(由于本项目是Maven项目,也可以直接在pom.xml中添加com.google.guava:guava依赖)。
此处给一个链接:
链接:https://pan.baidu.com/s/1zB5Ktd72cXQgGOErJIJSoA
提取码:t0wm
3.自己又编写了简易模型:
其中出现了一些小问题:
(1)关于JTextArea的setText,无法及时更新的现象。
解决方法:采用另写线程法解决。
/**
 * Helper that reports crawled URLs to the main window's text area
 * ({@code JLabelCrawler.x1}).
 */
class Data {

    /** Creates the helper; it holds no state. */
    public Data() {
    }

    /**
     * Appends one line for the given URL to the main window's text area and
     * then pauses the calling thread for 100 ms so the output appears
     * incrementally.
     *
     * <p>The append is posted to the Swing event-dispatch thread because this
     * method is intended to be called from a worker thread.
     *
     * @param url the crawled URL to display
     */
    public void outData(String url) {
        final String line = "爬取网址为:" + url + "\n";
        javax.swing.SwingUtilities.invokeLater(new Runnable() {
            @Override
            public void run() {
                JLabelCrawler.x1.append(line);
            }
        });
        try {
            // Deliberate delay: makes the incremental update visible, at the cost of speed.
            Thread.sleep(100);
        } catch (InterruptedException ex) {
            // Restore the interrupt flag so the caller can react to the interruption.
            Thread.currentThread().interrupt();
        }
    }
}
(2)附上swing代码:
// Set to 1 by the confirm button's listener and polled by the constructor's
// wait loop, i.e. shared between threads; volatile makes the write visible
// to the polling thread.
public static volatile int prepared =0;
// NOTE(review): usage of the next two flags is not visible in this snippet;
// if they are also shared between threads they should be volatile as well.
public static int shutdown =0;
public static int crawlerClose =0;
// NOTE(review): presumably the crawl URL, crawl depth and image folder
// entered by the user; confirm against the button handlers.
public String url,depth,storageFile;
private static final long serialVersionUID = 1L;
// Text fields placed next to the "storage path" (t1) and "crawl URL" (t2) labels.
JTextField t1,t2;
// Labels for the two text fields.
JLabel l1,l2;
// Panels: j1 holds the storage-path row, j2 the URL row, j3 is added empty.
JPanel j1,j2,j3;
// Buttons: b1 calls MethodA, b2 calls MethodB and sets prepared, b3 calls MethodC.
JButton b1,b2,b3;
// Output area; static so that other classes can append to it.
public static JTextArea x1;
// Scroll pane wrapping x1.
JScrollPane pane;
// public static void main(String[] args) {
// new JLabelCrawler();
// }
/**
 * Builds the crawler window with absolute positioning, shows it, and then
 * blocks the calling thread until the confirm button has set
 * {@code prepared}.
 *
 * <p>NOTE(review): because of the final wait loop this constructor must not
 * be invoked on the Swing event-dispatch thread, otherwise the button
 * listeners could never run. Declaring {@code prepared} as {@code volatile}
 * would additionally guarantee that the listener's write becomes visible to
 * the waiting thread.
 */
public JLabelCrawler(){
    // Absolute positioning: every component gets explicit bounds below.
    this.setLayout(null);
    this.setBounds(0, 0, 500, 500);
    x1 = new JTextArea(10,10);
    pane = new JScrollPane(x1);
    x1.setText("以下内容为爬取到的图片:"+"\n");
    j1 =new JPanel();
    j2 =new JPanel();
    j3 =new JPanel();
    l1= new JLabel("存储路径:"); // label for the storage path
    l2= new JLabel("爬取网址:"); // label for the crawl URL
    t1 = new JTextField(10); // storage path input
    t2 = new JTextField(10); // crawl URL input
    b1 = new JButton("指向");
    b1.addActionListener(new ActionListener() {
        @Override
        public void actionPerformed(ActionEvent e) {
            MethodA();
        }
    });
    b2 = new JButton("确认");
    b2.addActionListener(new ActionListener() {
        @Override
        public void actionPerformed(ActionEvent e) {
            MethodB();
            // Releases the wait loop at the end of the constructor.
            prepared=1;
        }
    });
    b3 = new JButton("停止");
    b3.addActionListener(new ActionListener() {
        @Override
        public void actionPerformed(ActionEvent e) {
            MethodC();
        }
    });
    // Row 1: storage path label, field and button.
    j1.add(l1);
    j1.add(t1);
    j1.add(b1);
    this.add(j1);
    j1.setBounds(100,50,300,30);
    // Row 2: crawl URL label and field.
    j2.add(l2);
    j2.add(t2);
    this.add(j2);
    j2.setBounds(100,90,235,30);
    // Confirm / stop buttons.
    this.add(b2);
    b2.setBounds(170,140,60,30);
    this.add(b3);
    b3.setBounds(270,140,60,30);
    // Output area; the statement order is kept exactly as in the original
    // because x1 is re-parented between the frame and the scroll pane.
    this.add(x1);
    x1.setBounds(50,200,400,230);
    this.add(j3);
    this.add(pane);
    pane.setBounds(50,200,400,230);
    pane.setViewportView(x1);
    this.setResizable(false); // window size is fixed
    this.setLocationRelativeTo(null);
    this.setTitle("爬起来,宝贝儿");
    this.setSize(500, 500);
    validate();
    this.setDefaultCloseOperation(EXIT_ON_CLOSE);
    this.setVisible(true);
    // Wait until the confirm button has been pressed. The original spun in an
    // empty loop, keeping one CPU core fully busy; sleeping between polls
    // keeps the same exit condition without the busy spin.
    // NOTE(review): the null checks are kept from the original; presumably
    // "field is empty" was intended — confirm.
    while(t2.getText()==null||t1.getText()==null||prepared==0) {
        try {
            Thread.sleep(50);
        } catch (InterruptedException ex) {
            // Restore the interrupt flag and stop waiting.
            Thread.currentThread().interrupt();
            break;
        }
    }
}
最后附上源码:…