使用jsoup爬取网页信息,保存到txt中

首先建立maven项目,导入相关的jar包

pom

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.text.jsoup</groupId>
  <artifactId>com.text.jsoup</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <dependencies>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.11.3</version>
    </dependency>
  </dependencies>

  <!-- The section below configures packaging.
       (NB: "//" is not a legal comment in XML and broke the original POM.) -->
  <build>
    <finalName>import_tool</finalName>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
      <!-- Copies runtime dependencies into target/lib on install. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-dependency-plugin</artifactId>
        <executions>
          <execution>
            <id>copy</id>
            <phase>install</phase>
            <goals>
              <goal>copy-dependencies</goal>
            </goals>
            <configuration>
              <outputDirectory>${project.build.directory}/lib</outputDirectory>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <!-- Bundles all dependencies into a single executable jar. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.5.5</version>
        <configuration>
          <archive>
            <manifest>
              <mainClass>com.text.jsoup.ThreadTest</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <!-- suffix appended to the packaged jar's name -->
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <!-- Bind assembly:single to "package" so "mvn package" actually
             produces the fat jar; without this it must be invoked by hand. -->
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>

爬取过程

这个业务是在某一个网站上爬取到某个class的所有的值,然后保存到date.txt 中,这里我使用的是多线程

package com.text.jsoup;


import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class ThreadTest implements Runnable{

    /** Last member id (inclusive) to crawl. */
    private static final int MAX_ID = 30000000;

    // Next member id to fetch. Resumed from the last line of /home/date.txt
    // so a crash/network outage can pick up where it left off.
    // Shared by all 100 worker threads; every access is guarded by
    // synchronized(this) in run().
    int i = FileUtil.getI();

    public static void main(String[] args) {
        // One shared Runnable so all threads cooperate on the same counter.
        ThreadTest t = new ThreadTest();
        for (int j = 0; j < 100; j++) {
            Thread t1 = new Thread(t);
            t1.setName("线程" + j);
            t1.start();
        }
    }

    /**
     * Worker loop: threads take turns (notify/wait hand-off on the shared
     * monitor) fetching one page each and appending the result to date.txt.
     */
    public void run() {
        while (true) {
            synchronized (this) {
                if (i > MAX_ID) {
                    // All ids processed: wake any parked workers so they can
                    // also observe completion and exit (the original looped
                    // forever here, busy-spinning on the monitor).
                    notifyAll();
                    return;
                }
                notify();
                try {
                    Thread.sleep(10); // crude rate limit between requests
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt(); // restore interrupt status
                    return;
                }
                // Fetch under the lock so no two threads process the same id.
                crawlOne(i);
                i++;
                try {
                    wait();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }

    /**
     * Fetches http://www.okooo.com/member/{id}, extracts the text of the
     * first element with class "xxx" and appends "id:text" to /home/date.txt.
     * Any failure (network error, missing element) is logged and the id is
     * skipped, so one bad page can no longer kill the thread with an NPE
     * as in the original (null response / null element dereference).
     */
    private void crawlOne(int id) {
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet("http://www.okooo.com/member/" + id);
            // try-with-resources guarantees the response is closed on all paths
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                HttpEntity httpEntity = response.getEntity();
                String string = EntityUtils.toString(httpEntity, "utf-8");
                Document parse = Jsoup.parse(string);
                Elements elementsByTag = parse.getElementsByClass("xxx");
                if (elementsByTag.isEmpty()) {
                    return; // page has no matching element - nothing to record
                }
                String text = elementsByTag.get(0).text();
                String fileName = "/home/date.txt";
                FileUtil.saveTxt(fileName, id + ":" + text);
            }
        } catch (Exception e) {
            // Log and move on; the caller advances to the next id regardless.
            e.printStackTrace();
        }
    }
}

部分工具类

package com.text.jsoup;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;

public class FileUtil {

    /** Path of the crawl-progress file shared by saveTxt's callers and getI. */
    private static final String DATA_FILE = "/home/date.txt";

    private FileUtil() {
        // utility class - no instances
    }

    /**
     * Reads the last line of {@code file} (including its trailing newline
     * bytes, matching the original behavior minus its off-by-one).
     *
     * @param file    file to inspect
     * @param charset charset of the file's bytes; {@code null} means the
     *                platform default
     * @return the last line; {@code ""} for an empty file; {@code null} if the
     *         file is missing, unreadable, or a directory
     * @throws IOException on read errors other than the file vanishing
     */
    public static String readLastLine(File file, String charset) throws IOException {
        if (!file.exists() || file.isDirectory() || !file.canRead()) {
            return null;
        }
        try (RandomAccessFile raf = new RandomAccessFile(file, "r")) {
            long len = raf.length();
            if (len == 0L) {
                return "";
            }
            // Scan backwards for the '\n' that terminates the previous line.
            long pos = len - 1;
            while (pos > 0) {
                pos--;
                raf.seek(pos);
                if (raf.readByte() == '\n') {
                    // Line starts AFTER this newline. The original sized its
                    // buffer as (len - pos), one byte too many, which left a
                    // stray NUL at the end of the returned string.
                    pos++;
                    break;
                }
            }
            raf.seek(pos);
            byte[] bytes = new byte[(int) (len - pos)];
            // readFully instead of read(): a plain read() may return short.
            raf.readFully(bytes);
            return charset == null ? new String(bytes) : new String(bytes, charset);
        } catch (FileNotFoundException e) {
            // File disappeared between the canRead() check and opening it;
            // treat the same as "missing" above.
            return null;
        }
    }

    /**
     * Appends {@code content} plus a CRLF to {@code fileName}, creating the
     * file if needed. Bytes are written as UTF-8 so they round-trip with
     * getI(), which reads the file back as UTF-8 (the original used the
     * platform default charset for writing, risking a mismatch).
     *
     * @param fileName path of the file to append to
     * @param content  line to append (without trailing newline)
     */
    public static void saveTxt(String fileName, String content) {
        try (RandomAccessFile randomFile = new RandomAccessFile(fileName, "rw")) {
            // Move the write pointer to the end of the file (append).
            randomFile.seek(randomFile.length());
            randomFile.write((content + "\r\n").getBytes("utf-8"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Recovers the crawl position after a crash: reads the last saved line
     * ("id:text") and returns id + 1, or 0 when there is no usable state.
     * A malformed or empty last line no longer throws NumberFormatException
     * (the original crashed on e.g. an empty file, whose last line is "").
     */
    public static int getI() {
        String readLastLine = null;
        try {
            readLastLine = FileUtil.readLastLine(new File(DATA_FILE), "utf-8");
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (readLastLine != null && !readLastLine.trim().isEmpty()) {
            try {
                return Integer.parseInt(readLastLine.trim().split(":")[0]) + 1;
            } catch (NumberFormatException e) {
                // Corrupt last line - fall through and restart from 0.
                e.printStackTrace();
            }
        }
        return 0;
    }

}

该项目可以直接打包成可执行jar,然后运行即可,效率方面一般,还望路过的大神指点

猜你喜欢

转载自blog.csdn.net/qq_38058332/article/details/88953009