Self-encapsulated crawler base class.
/**
* Identifying information of a crawler task: a name and a unique code.
*/
public interface TaskBaseInfo {
/**
* Returns the name of the task.
* <br/>
* Generally used in log output.
* @return the task name
*/
String taskName();
/**
* Returns the unique code of the task.
* @return a code value that is not repeated anywhere in the entire crawler project
*/
String taskCode();
}
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.JedisCluster;
/**
 * String cache for crawler tasks that stores each value in Redis and mirrors it to a file.
 * <p>
 * Reads try Redis first and fall back to the file copy when Redis holds no (non-blank) value.
 * The Redis key is {@code cacheKey} alone; {@code taskCode} only determines the file location.
 */
public interface TaskStringCache {

    Logger logger = LoggerFactory.getLogger(TaskStringCache.class);

    /** Directory prefix under which the file copies of cached values are stored. */
    String BASE_FILE_PATH = "/mfs/ShareFile/static/cms/crawler/cache/";

    /**
     * Supplies the Redis cluster client used by the default cache methods.
     *
     * @return the client on which {@code setex}/{@code get} are invoked
     */
    JedisCluster obtainJedisCluster();

    /**
     * Reads a cached string.
     *
     * @param taskCode unique task code
     * @param cacheKey cache key
     * @return the cached value as defined by the implementation
     */
    String getCacheStr(String taskCode, String cacheKey);

    /**
     * Stores a cached string.
     *
     * @param taskCode     unique task code
     * @param cacheKey     cache key
     * @param cacheSeconds expiry passed to Redis, in seconds
     * @param cacheStr     value to cache
     */
    void setCacheStr(String taskCode, String cacheKey, int cacheSeconds, String cacheStr);

    /**
     * Builds the path of the file copy for a cache entry:
     * {@code BASE_FILE_PATH/<taskCode>/<cacheKey>.properties}.
     * <p>
     * The directory separator is {@link File#separator}. The previous code used
     * {@link File#pathSeparator} (the {@code PATH}-list delimiter, {@code ':'} or {@code ';'}),
     * which produced a single flat file name instead of a per-task directory.
     * NOTE(review): files written before this fix live under the old flat name and are not
     * consulted any more; migrate them if the fallback copy of existing entries matters.
     *
     * @param taskCode unique task code, used as the directory name
     * @param cacheKey cache key, used as the file base name
     * @return the target file path
     */
    default String obtainTargetFilePath(String taskCode, String cacheKey) {
        return BASE_FILE_PATH + taskCode + File.separator + cacheKey + ".properties";
    }

    /**
     * Default way of setting a cache entry: writes the value to Redis with an expiry and
     * then schedules an asynchronous write of the same value to the file copy.
     * The method returns without waiting for the file write to finish.
     *
     * @param taskName     task name, used for logging only
     * @param taskCode     task code, must be unique
     * @param cacheKey     cache key
     * @param cacheSeconds expiry of the Redis entry, in seconds
     * @param cacheStr     cached value
     */
    default void defaultSetCacheStr(String taskName, String taskCode, String cacheKey, int cacheSeconds, String cacheStr) {
        JedisCluster jedisCluster = obtainJedisCluster();
        jedisCluster.setex(cacheKey, cacheSeconds, cacheStr);
        String targetFilePath = obtainTargetFilePath(taskCode, cacheKey);
        save2FileAtomic(taskName, targetFilePath, cacheStr);
    }

    /**
     * Gets a cache entry stored by
     * {@link #defaultSetCacheStr(String, String, String, int, String)}.
     * <p>
     * A non-blank Redis value is returned directly. Otherwise the file copy is read; if that
     * read fails with an exception the (blank or {@code null}) Redis value is returned.
     *
     * @param taskName task name, used for logging only
     * @param taskCode task code, must be unique
     * @param cacheKey cache key
     * @return the cached value; may be blank or {@code null} when neither store has it
     */
    default String defaultGetCacheStr(String taskName, String taskCode, String cacheKey) {
        JedisCluster jedisCluster = obtainJedisCluster();
        String cacheStr = jedisCluster.get(cacheKey);
        if (StringUtils.isNotBlank(cacheStr)) {
            return cacheStr;
        }
        String targetFilePath = obtainTargetFilePath(taskCode, cacheKey);
        try {
            // The result is needed immediately, so this blocks until the asynchronous read completes.
            cacheStr = readFile(targetFilePath).get();
        } catch (InterruptedException | ExecutionException e) {
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers further up can still observe the interruption.
                Thread.currentThread().interrupt();
            }
            logger.error("[{}] Failed to get the file cache content asynchronously. taskCode=>[{}] cacheKey=>[{}] ",
                    taskName, taskCode, cacheKey, e);
        }
        return cacheStr;
    }

    /**
     * Persists a cached value to a file asynchronously.
     * <p>
     * The content is first written (UTF-8) to {@code filePath + ".tmp"} and then moved over the
     * target, so readers do not observe a partially written target file. The move is attempted
     * atomically first; if that fails, a single replace-existing move is attempted. A failure is
     * logged and the temporary file is removed; there is no further retry (the previous
     * implementation re-submitted itself without bound when the rename failed, e.g. when the
     * target already existed on Windows).
     *
     * @param taskName task name, used for logging only
     * @param filePath path of the target file
     * @param content  text to store
     */
    default void save2FileAtomic(String taskName, String filePath, String content) {
        CompletableFuture.runAsync(() -> {
            File tmpFile = new File(filePath + ".tmp");

            // Stage 1: write the full content to the temporary file.
            try {
                Path tmpPath = tmpFile.toPath();
                Path parentDir = tmpPath.getParent();
                if (parentDir != null) {
                    Files.createDirectories(parentDir);
                }
                Files.write(tmpPath, content.getBytes(StandardCharsets.UTF_8));
            } catch (IOException | InvalidPathException e) {
                logger.error("[{}] => write cache string to file => [{}] exception \n{}",
                        taskName, tmpFile, e.getMessage(), e);
                logger.error("[{}] file write operation exited", taskName);
                tmpFile.delete();
                return;
            }

            // Stage 2: publish the temporary file under the target name.
            try {
                Path tmpPath = tmpFile.toPath();
                Path targetPath = new File(filePath).toPath();
                try {
                    Files.move(tmpPath, targetPath, StandardCopyOption.ATOMIC_MOVE);
                } catch (IOException atomicMoveFailure) {
                    // Atomic move unsupported or refused (e.g. existing target on some platforms):
                    // fall back once to a plain move that replaces the target.
                    try {
                        Files.move(tmpPath, targetPath, StandardCopyOption.REPLACE_EXISTING);
                    } catch (IOException fallbackFailure) {
                        fallbackFailure.addSuppressed(atomicMoveFailure);
                        throw fallbackFailure;
                    }
                }
            } catch (IOException | InvalidPathException e) {
                logger.error("move fails filePath:{}", filePath, e);
                tmpFile.delete();
            }
        });
    }

    /**
     * Reads the whole content of a file asynchronously, decoded as UTF-8.
     * <p>
     * The content is returned exactly as stored, including line terminators (the previous
     * implementation dropped them, so multi-line values did not round-trip).
     *
     * @param filePath path of the file to read
     * @return a future completing with the file content, or with an empty string when the file
     *         cannot be read (the failure is logged)
     */
    default CompletableFuture<String> readFile(String filePath) {
        return CompletableFuture.supplyAsync(() -> {
            try {
                byte[] raw = Files.readAllBytes(new File(filePath).toPath());
                return new String(raw, StandardCharsets.UTF_8);
            } catch (IOException | InvalidPathException e) {
                logger.error(e.getMessage(), e);
                return StringUtils.EMPTY;
            }
        });
    }
}
/**
* Minimal task contract: a single no-argument entry point.
*/
public interface BasicTask {
/**
* Executes the task.
* NOTE(review): what a run does, and how it is scheduled, is defined by implementations and callers not visible here.
*/
void run();
}
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import com.xxx.zx.crawler.basic.BasicTask;
/**
 * Base class for crawler tasks.
 * <p>
 * Combines the task identity ({@link TaskBaseInfo}), the string cache ({@link TaskStringCache})
 * and the run entry point ({@link BasicTask}), and exposes the Spring
 * {@link ApplicationContext} it receives through static bean-lookup helpers.
 */
public abstract class BaseCrawlerTask implements TaskBaseInfo, TaskStringCache, BasicTask, ApplicationContextAware {

    /** Logger named after the concrete subclass. */
    protected final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Application context shared by all tasks. Declared {@code volatile} because it is written
     * by {@link #setApplicationContext(ApplicationContext)} and read by the static lookups,
     * possibly on different threads.
     */
    protected static volatile ApplicationContext ac;

    /**
     * Stores the injected application context in the shared static field.
     *
     * @param applicationContext the context passed in by the container
     * @throws BeansException declared by {@link ApplicationContextAware}
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        ac = applicationContext;
    }

    /**
     * Looks up a bean by type.
     * <p>
     * The lookup is delegated to the context without holding a class-wide lock, so a slow or
     * blocking bean creation does not stall every other lookup.
     *
     * @param beanClass type of the requested bean
     * @param <T>       bean type
     * @return the bean returned by the application context
     * @throws IllegalStateException if no application context has been set yet
     */
    public static <T> T getBean(Class<T> beanClass) {
        return requireContext().getBean(beanClass);
    }

    /**
     * Looks up a bean by name.
     *
     * @param beanName name of the requested bean
     * @return the bean returned by the application context
     * @throws IllegalStateException if no application context has been set yet
     */
    public static Object getBean(String beanName) {
        return requireContext().getBean(beanName);
    }

    /** Returns the shared context, failing with a descriptive error when it is not set yet. */
    private static ApplicationContext requireContext() {
        ApplicationContext context = ac;
        if (context == null) {
            throw new IllegalStateException("ApplicationContext has not been set on BaseCrawlerTask yet");
        }
        return context;
    }

    /**
     * Reads a cached string via the default Redis-then-file lookup, using this task's name for logging.
     */
    @Override
    public String getCacheStr(String taskCode, String cacheKey) {
        return defaultGetCacheStr(taskName(), taskCode, cacheKey);
    }

    /**
     * Stores a cached string via the default Redis-plus-file write, using this task's name for logging.
     */
    @Override
    public void setCacheStr(String taskCode, String cacheKey, int cacheSeconds, String cacheStr) {
        defaultSetCacheStr(taskName(), taskCode, cacheKey, cacheSeconds, cacheStr);
    }
}