前言
此处仅大致地将 Quartz 的逻辑梳理出来,有些具体细节我也不是很清楚(苦笑)。
QuartzSchedulerThread
QuartzSchedulerThread 是 Quartz 的主调度线程,Quartz 的任务触发完全依靠这个线程来完成。
QuartzSchedulerThread
/**
 * Main loop of the scheduler thread.
 *
 * <p>Each iteration: waits while paused, backs off after repeated acquisition
 * failures, blocks until the worker pool has a free thread, acquires the next
 * batch of triggers from the job store, waits until the first trigger is due,
 * marks the batch as fired and hands each resulting job to the thread pool.
 * When nothing was acquired, the thread idles on {@code sigLock} for a
 * randomized wait time. The loop ends once {@code halted} is set.
 */
@Override
public void run() {
    int failedAcquires = 0;

    while (!halted.get()) {
        try {
            // Park here for as long as the scheduler is paused.
            synchronized (sigLock) {
                while (paused && !halted.get()) {
                    try {
                        // wait until togglePause(false) is called (re-checked every second)
                        sigLock.wait(1000L);
                    } catch (InterruptedException ignore) {
                    }
                    // reset the failure counter while paused, so that we don't
                    // back off again right after unpausing
                    failedAcquires = 0;
                }
                if (halted.get()) {
                    break;
                }
            }

            // Back off a little when reading from the job store keeps failing
            // (e.g. the DB is down or restarting).
            if (failedAcquires > 1) {
                try {
                    long backoff = computeDelayForRepeatedErrors(qsRsrcs.getJobStore(), failedAcquires);
                    Thread.sleep(backoff);
                } catch (Exception ignore) {
                }
            }

            // Number of worker threads currently free in the pool.
            int freeWorkers = qsRsrcs.getThreadPool().blockForAvailableThreads();
            if (freeWorkers <= 0) {
                // should never happen, if threadPool.blockForAvailableThreads() follows contract
                continue;
            }

            List<OperableTrigger> triggers;
            long now = System.currentTimeMillis();
            clearSignaledSchedulingChange();
            try {
                // Ask the job store for triggers due no later than now + idleWaitTime;
                // the batch is capped by both the free workers and the configured
                // max batch size.
                // NOTE(review): the article states the defaults are idleWaitTime = 30s,
                // maxBatchSize = 1 and batchTimeWindow = 0 - not visible in this excerpt.
                triggers = qsRsrcs.getJobStore().acquireNextTriggers(
                        now + idleWaitTime,
                        Math.min(freeWorkers, qsRsrcs.getMaxBatchSize()),
                        qsRsrcs.getBatchTimeWindow());
                // acquisition succeeded: clear the failure counter
                failedAcquires = 0;
                if (log.isDebugEnabled()) {
                    log.debug("batch acquisition of " + (triggers == null ? 0 : triggers.size()) + " triggers");
                }
            } catch (JobPersistenceException jpe) {
                // only report the first failure of a streak
                if (failedAcquires == 0) {
                    qs.notifySchedulerListenersError(
                            "An error occurred while scanning for the next triggers to fire.", jpe);
                }
                if (failedAcquires < Integer.MAX_VALUE) {
                    failedAcquires++;
                }
                continue;
            } catch (RuntimeException e) {
                if (failedAcquires == 0) {
                    getLog().error("quartzSchedulerThreadLoop: RuntimeException " + e.getMessage(), e);
                }
                if (failedAcquires < Integer.MAX_VALUE) {
                    failedAcquires++;
                }
                continue;
            }

            if (triggers != null && !triggers.isEmpty()) {
                now = System.currentTimeMillis();
                // fire time of the first trigger in the batch
                long triggerTime = triggers.get(0).getNextFireTime().getTime();
                // how long until that trigger is due
                long timeUntilTrigger = triggerTime - now;

                while (timeUntilTrigger > 2) {
                    synchronized (sigLock) {
                        if (halted.get()) {
                            break;
                        }
                        if (!isCandidateNewTimeEarlierWithinReason(triggerTime, false)) {
                            try {
                                // we could have blocked a long while
                                // on 'synchronize', so we must recompute
                                now = System.currentTimeMillis();
                                timeUntilTrigger = triggerTime - now;
                                if (timeUntilTrigger >= 1) {
                                    sigLock.wait(timeUntilTrigger);
                                }
                            } catch (InterruptedException ignore) {
                            }
                        }
                    }
                    // The schedule may have changed while we waited (e.g. a more urgent
                    // trigger arrived). If the helper released the acquired batch, stop
                    // waiting so the outer loop can acquire again.
                    // NOTE(review): per the article, the helper keeps the current batch
                    // when re-acquiring could not beat the new trigger's fire time.
                    if (releaseIfScheduleChangedSignificantly(triggers, triggerTime)) {
                        break;
                    }
                    now = System.currentTimeMillis();
                    timeUntilTrigger = triggerTime - now;
                }

                // this happens if releaseIfScheduleChangedSignificantly decided to release triggers
                if (triggers.isEmpty()) {
                    continue;
                }

                // set triggers to 'executing'
                List<TriggerFiredResult> firedResults = new ArrayList<TriggerFiredResult>();

                boolean goAhead;
                synchronized (sigLock) {
                    goAhead = !halted.get();
                }
                if (goAhead) {
                    try {
                        // Let the job store mark the triggers as fired and return one
                        // result per trigger.
                        List<TriggerFiredResult> res = qsRsrcs.getJobStore().triggersFired(triggers);
                        if (res != null) {
                            firedResults = res;
                        }
                    } catch (SchedulerException se) {
                        qs.notifySchedulerListenersError(
                                "An error occurred while firing triggers '" + triggers + "'", se);
                        //QTZ-179 : a problem occurred interacting with the triggers from the db
                        //we release them and loop again
                        for (OperableTrigger acquired : triggers) {
                            qsRsrcs.getJobStore().releaseAcquiredTrigger(acquired);
                        }
                        continue;
                    }
                }

                // Walk the fired results; entry i corresponds to triggers.get(i).
                for (int i = 0; i < firedResults.size(); i++) {
                    TriggerFiredResult result = firedResults.get(i);
                    TriggerFiredBundle bundle = result.getTriggerFiredBundle();
                    Exception failure = result.getException();

                    if (failure instanceof RuntimeException) {
                        getLog().error("RuntimeException while firing trigger " + triggers.get(i), failure);
                        qsRsrcs.getJobStore().releaseAcquiredTrigger(triggers.get(i));
                        continue;
                    }

                    // it's possible to get 'null' if the triggers was paused,
                    // blocked, or other similar occurrences that prevent it being
                    // fired at this time... or if the scheduler was shutdown (halted)
                    if (bundle == null) {
                        // give the trigger back to the job store
                        qsRsrcs.getJobStore().releaseAcquiredTrigger(triggers.get(i));
                        continue;
                    }

                    JobRunShell shell;
                    try {
                        // build the shell that will execute the job
                        shell = qsRsrcs.getJobRunShellFactory().createJobRunShell(bundle);
                        shell.initialize(qs);
                    } catch (SchedulerException se) {
                        qsRsrcs.getJobStore().triggeredJobComplete(triggers.get(i), bundle.getJobDetail(),
                                CompletedExecutionInstruction.SET_ALL_JOB_TRIGGERS_ERROR);
                        continue;
                    }

                    // Hand the shell to the worker pool; from the scheduler thread's
                    // point of view the trigger has now been fired.
                    if (!qsRsrcs.getThreadPool().runInThread(shell)) {
                        // this case should never happen, as it is indicative of the
                        // scheduler being shutdown or a bug in the thread pool or
                        // a thread pool being used concurrently - which the docs
                        // say not to do...
                        getLog().error("ThreadPool.runInThread() return false!");
                        qsRsrcs.getJobStore().triggeredJobComplete(triggers.get(i), bundle.getJobDetail(),
                                CompletedExecutionInstruction.SET_ALL_JOB_TRIGGERS_ERROR);
                    }
                }

                continue; // while (!halted)
            }

            // Nothing was acquired: idle for a randomized period unless signalled.
            now = System.currentTimeMillis();
            long wakeUpAt = now + getRandomizedIdleWaitTime();
            long idleMillis = wakeUpAt - now;
            synchronized (sigLock) {
                try {
                    // QTZ-336 A job might have been completed in the mean time and we might have
                    // missed the scheduled changed signal by not waiting for the notify() yet
                    // Check that before waiting for too long in case this very job needs to be
                    // scheduled very soon
                    if (!halted.get() && !isScheduleChanged()) {
                        sigLock.wait(idleMillis);
                    }
                } catch (InterruptedException ignore) {
                }
            }
        } catch (RuntimeException re) {
            getLog().error("Runtime error occurred in main trigger firing loop.", re);
        }
    } // while (!halted)

    // drop references to scheduler stuff to aid garbage collection...
    qs = null;
    qsRsrcs = null;
}
|
JobStoreSupport
任务的存储类,这里面包含了上面提到的两个比较核心的方法
acquireNextTriggers
/**
 * Acquires the next batch of triggers inside a non-managed transaction.
 *
 * <p>The trigger-access lock is only requested when
 * {@code isAcquireTriggersWithinLock()} is true or more than one trigger is
 * requested; otherwise the work runs without a lock name. The actual work is
 * delegated to {@code acquireNextTrigger}.
 * NOTE(review): the article states that isAcquireTriggersWithinLock defaults
 * to false - not visible in this excerpt.
 *
 * @param noLaterThan upper bound passed on to {@code acquireNextTrigger}
 * @param maxCount    maximum number of triggers to acquire
 * @param timeWindow  batch time window passed on to {@code acquireNextTrigger}
 * @return whatever {@code executeInNonManagedTXLock} returns for the callback
 * @throws JobPersistenceException if acquisition or its validation fails
 */
public List<OperableTrigger> acquireNextTriggers(final long noLaterThan, final int maxCount, final long timeWindow)
        throws JobPersistenceException {

    final boolean lockNeeded = isAcquireTriggersWithinLock() || maxCount > 1;
    final String lockName = lockNeeded ? LOCK_TRIGGER_ACCESS : null;

    // The transactional work itself: delegate to acquireNextTrigger.
    TransactionCallback<List<OperableTrigger>> acquireWork =
            new TransactionCallback<List<OperableTrigger>>() {
                public List<OperableTrigger> execute(Connection conn) throws JobPersistenceException {
                    return acquireNextTrigger(conn, noLaterThan, maxCount, timeWindow);
                }
            };

    // Validator: reports true when at least one returned trigger has a
    // fired-trigger record belonging to this scheduler instance.
    TransactionValidator<List<OperableTrigger>> acquireCheck =
            new TransactionValidator<List<OperableTrigger>>() {
                public Boolean validate(Connection conn, List<OperableTrigger> result)
                        throws JobPersistenceException {
                    Set<String> recordedIds = new HashSet<String>();
                    try {
                        List<FiredTriggerRecord> records =
                                getDelegate().selectInstancesFiredTriggerRecords(conn, getInstanceId());
                        for (FiredTriggerRecord record : records) {
                            recordedIds.add(record.getFireInstanceId());
                        }
                    } catch (SQLException e) {
                        throw new JobPersistenceException("error validating trigger acquisition", e);
                    }
                    for (OperableTrigger candidate : result) {
                        if (recordedIds.contains(candidate.getFireInstanceId())) {
                            return true;
                        }
                    }
                    return false;
                }
            };

    return executeInNonManagedTXLock(lockName, acquireWork, acquireCheck);
}
/**
 * Selects candidate triggers and moves them from WAITING to ACQUIRED.
 *
 * <p>Makes up to three selection passes. A pass returns immediately when the
 * delegate finds no candidate keys; further passes are only made while nothing
 * has been acquired yet.
 *
 * @param conn        connection used for all delegate calls
 * @param noLaterThan initial batch end; candidates are selected up to
 *                    {@code noLaterThan + timeWindow}
 * @param maxCount    maximum number of trigger keys to select per pass
 * @param timeWindow  non-negative window added to the batch end
 * @return the acquired triggers, possibly empty
 * @throws IllegalArgumentException if {@code timeWindow} is negative
 * @throws JobPersistenceException  wrapping any exception raised during a pass
 */
protected List<OperableTrigger> acquireNextTrigger(Connection conn, long noLaterThan, int maxCount, long timeWindow)
        throws JobPersistenceException {
    if (timeWindow < 0) {
        throw new IllegalArgumentException();
    }

    List<OperableTrigger> acquiredTriggers = new ArrayList<OperableTrigger>();
    // jobs that disallow concurrent execution and were already seen in this call
    Set<JobKey> seenNonConcurrentJobs = new HashSet<JobKey>();
    // at most three passes
    final int MAX_DO_LOOP_RETRY = 3;

    for (int pass = 1; pass <= MAX_DO_LOOP_RETRY; pass++) {
        try {
            // select keys of triggers due before noLaterThan + timeWindow
            List<TriggerKey> keys = getDelegate().selectTriggerToAcquire(
                    conn, noLaterThan + timeWindow, getMisfireTime(), maxCount);

            // No trigger is ready to fire yet.
            if (keys == null || keys.size() == 0) {
                return acquiredTriggers;
            }

            // cut-off time for this batch
            long batchEnd = noLaterThan;

            for (TriggerKey triggerKey : keys) {
                // If our trigger is no longer available, try a new one.
                OperableTrigger candidate = retrieveTrigger(conn, triggerKey);
                if (candidate == null) {
                    continue; // next trigger
                }

                // load the job detail the trigger points to
                JobKey jobKey = candidate.getJobKey();
                JobDetail job;
                try {
                    job = retrieveJob(conn, jobKey);
                } catch (JobPersistenceException jpe) {
                    try {
                        getLog().error("Error retrieving job, setting trigger state to ERROR.", jpe);
                        getDelegate().updateTriggerState(conn, triggerKey, STATE_ERROR);
                    } catch (SQLException sqle) {
                        getLog().error("Unable to set trigger state to ERROR.", sqle);
                    }
                    continue;
                }

                // If trigger's job is set as @DisallowConcurrentExecution, and it has
                // already been seen, skip it; Set.add reports whether the key was new.
                if (job.isConcurrentExectionDisallowed() && !seenNonConcurrentJobs.add(jobKey)) {
                    continue; // next trigger
                }

                // a trigger due after the batch end stops the scan of this pass
                if (candidate.getNextFireTime().getTime() > batchEnd) {
                    break;
                }

                // We now have a acquired trigger, let's add to return list.
                // If our trigger was no longer in the expected state, try a new one.
                int rowsUpdated = getDelegate().updateTriggerStateFromOtherState(
                        conn, triggerKey, STATE_ACQUIRED, STATE_WAITING);
                if (rowsUpdated <= 0) {
                    continue; // next trigger
                }

                candidate.setFireInstanceId(getFiredTriggerRecordId());
                // record the acquisition in the fired-triggers table
                getDelegate().insertFiredTrigger(conn, candidate, STATE_ACQUIRED, null);

                // the first acquired trigger fixes the end of the batch window
                if (acquiredTriggers.isEmpty()) {
                    long firstFire = candidate.getNextFireTime().getTime();
                    batchEnd = Math.max(firstFire, System.currentTimeMillis()) + timeWindow;
                }
                acquiredTriggers.add(candidate);
            }

            // if we didn't end up with any trigger to fire from that first
            // batch, try again for another batch (bounded by the pass limit).
            if (!acquiredTriggers.isEmpty()) {
                break;
            }
        } catch (Exception e) {
            throw new JobPersistenceException("Couldn't acquire next trigger: " + e.getMessage(), e);
        }
    }

    // Return the acquired trigger list
    return acquiredTriggers;
}
|
上面看到的是触发器获取的详细实现。只有当 isAcquireTriggersWithinLock() 为 true,或者每次获取的 maxCount 大于 1 时,才会使用悲观锁(LOCK_TRIGGER_ACCESS),防止任务在集群状态下
被重复获取。默认 maxCount=1,这也就导致在默认的集群模式下,如果不做相应配置,在并发状态下就可能出现任务
被重复获取、进而被重复触发的情况。
triggersFired
在主线程里面调用如下:
List<TriggerFiredResult> res = qsRsrcs.getJobStore().triggersFired(triggers);
/**
 * Marks a batch of acquired triggers as fired, always under the
 * trigger-access lock.
 *
 * <p>Each trigger is processed individually by {@code triggerFired}; a failure
 * for one trigger is captured in its {@code TriggerFiredResult} instead of
 * aborting the batch.
 *
 * @param triggers the triggers to fire
 * @return whatever {@code executeInNonManagedTXLock} returns for the callback,
 *         which builds one result per trigger in input order
 * @throws JobPersistenceException if the transaction or its validation fails
 */
public List<TriggerFiredResult> triggersFired(final List<OperableTrigger> triggers)
        throws JobPersistenceException {

    // The transactional work: fire the triggers one by one.
    TransactionCallback<List<TriggerFiredResult>> fireWork =
            new TransactionCallback<List<TriggerFiredResult>>() {
                public List<TriggerFiredResult> execute(Connection conn) throws JobPersistenceException {
                    List<TriggerFiredResult> outcomes = new ArrayList<TriggerFiredResult>();
                    for (OperableTrigger trigger : triggers) {
                        TriggerFiredResult outcome;
                        try {
                            TriggerFiredBundle bundle = triggerFired(conn, trigger);
                            outcome = new TriggerFiredResult(bundle);
                        } catch (JobPersistenceException jpe) {
                            outcome = new TriggerFiredResult(jpe);
                        } catch (RuntimeException re) {
                            outcome = new TriggerFiredResult(re);
                        }
                        outcomes.add(outcome);
                    }
                    return outcomes;
                }
            };

    // Validator: reports true when at least one result's trigger has a
    // fired-trigger record of this instance in EXECUTING state.
    TransactionValidator<List<TriggerFiredResult>> fireCheck =
            new TransactionValidator<List<TriggerFiredResult>>() {
                @Override
                public Boolean validate(Connection conn, List<TriggerFiredResult> result)
                        throws JobPersistenceException {
                    try {
                        List<FiredTriggerRecord> records =
                                getDelegate().selectInstancesFiredTriggerRecords(conn, getInstanceId());
                        Set<String> executingIds = new HashSet<String>();
                        for (FiredTriggerRecord record : records) {
                            if (STATE_EXECUTING.equals(record.getFireInstanceState())) {
                                executingIds.add(record.getFireInstanceId());
                            }
                        }
                        for (TriggerFiredResult fired : result) {
                            TriggerFiredBundle bundle = fired.getTriggerFiredBundle();
                            if (bundle != null
                                    && executingIds.contains(bundle.getTrigger().getFireInstanceId())) {
                                return true;
                            }
                        }
                        return false;
                    } catch (SQLException e) {
                        throw new JobPersistenceException("error validating trigger acquisition", e);
                    }
                }
            };

    // the lock name is passed unconditionally here
    return executeInNonManagedTXLock(LOCK_TRIGGER_ACCESS, fireWork, fireCheck);
}
protected
TriggerFiredBundle triggerFired(Connection conn,
OperableTrigger trigger)
throws
JobPersistenceException {
JobDetail job;
Calendar cal =
null
;
// Make sure trigger wasn't deleted, paused, or completed...
try
{
// if trigger was deleted, state will be STATE_DELETED
// 验证trigger的状态,如果不是等于ACQUIRED的,则直接return null
String state = getDelegate().selectTriggerState(conn,
trigger.getKey());
if
(!state.equals(STATE_ACQUIRED)) {
return
null
;
}
}
catch
(SQLException e) {
throw
new
JobPersistenceException(
"Couldn't select trigger state: "
+ e.getMessage(), e);
}
try
{
// 获取这个trigger的任务详情。
job = retrieveJob(conn, trigger.getJobKey());
if
(job ==
null
) {
return
null
; }
}
catch
(JobPersistenceException jpe) {
try
{
getLog().error(
"Error retrieving job, setting trigger state to ERROR."
, jpe);
getDelegate().updateTriggerState(conn, trigger.getKey(),
STATE_ERROR);
}
catch
(SQLException sqle) {
getLog().error(
"Unable to set trigger state to ERROR."
, sqle);
}
throw
jpe;
}
if
(trigger.getCalendarName() !=
null
) {
// 这里主要是对非集群模式下做一些缓存处理
cal = retrieveCalendar(conn, trigger.getCalendarName());
if
(cal ==
null
) {
return
null
; }
}
try
{
// 更新触发记录的状态为EXECUTING
getDelegate().updateFiredTrigger(conn, trigger, STATE_EXECUTING, job);
}
catch
(SQLException e) {
throw
new
JobPersistenceException(
"Couldn't insert fired trigger: "
+ e.getMessage(), e);
}
Date prevFireTime = trigger.getPreviousFireTime();
// call triggered - to update the trigger's next-fire-time state...
// 计算下一次的trigger的执行时间
trigger.triggered(cal);
String state = STATE_WAITING;
boolean
force =
true
;
//如果任务是不允许并发执行的,那么需要将任务的状态修改为BLOCK,阻塞
if
(job.isConcurrentExectionDisallowed()) {
state = STATE_BLOCKED;
force =
false
;
try
{
getDelegate().updateTriggerStatesForJobFromOtherState(conn, job.getKey(),
STATE_BLOCKED, STATE_WAITING);
getDelegate().updateTriggerStatesForJobFromOtherState(conn, job.getKey(),
STATE_BLOCKED, STATE_ACQUIRED);
getDelegate().updateTriggerStatesForJobFromOtherState(conn, job.getKey(),
STATE_PAUSED_BLOCKED, STATE_PAUSED);
}
catch
(SQLException e) {
throw
new
JobPersistenceException(
"Couldn't update states of blocked triggers: "
+ e.getMessage(), e);
}
}
if
(trigger.getNextFireTime() ==
null
) {
// 下次执行时间为空,也就是说没有下次了,直接修改trigger的状态为完成
state = STATE_COMPLETE;
force =
true
;
}
// 修改trigger的撞他信息
storeTrigger(conn, trigger, job,
true
, state, force,
false
);
job.getJobDataMap().clearDirtyFlag();
// 返回任务的执行信息
return
new
TriggerFiredBundle(job, trigger, cal, trigger.getKey().getGroup()
.equals(Scheduler.DEFAULT_RECOVERY_GROUP),
new
Date(), trigger
.getPreviousFireTime(), prevFireTime, trigger.getNextFireTime());
|
该方法做了以下工作:
1.获取trigger当前状态
2.通过trigger中的JobKey读取trigger包含的Job信息
3.将该trigger对应的触发记录(fired trigger)状态更新为EXECUTING
4.计算下一次触发时间,并更新数据库中trigger的信息:状态默认更新为STATE_WAITING;如果Job不允许并发执行,则为STATE_BLOCKED;如果没有下一次触发时间,则为STATE_COMPLETE.
5.返回trigger触发结果的数据传输类TriggerFiredBundle
从该方法返回后,trigger的执行过程已基本完毕.回到执行quartz操作规范的executeInNonManagedTXLock方法,将数据库锁释放.
trigger触发操作完成
总结:
简单地说,quartz的分布式调度策略是以数据库为边界资源的一种异步策略.各个调度器都遵守一个基于数据库锁的操作规则保证了操作的唯一性.
同时多个节点的异步运行保证了服务的可靠.但这种策略有自己的局限性,集群特性对于高cpu使用率的任务效果很好,但是对于大量的短任务,
各个节点都会抢占数据库锁,这样就出现大量的线程等待资源.这种情况随着节点的增加会越来越严重.