|
@@ -48,7 +48,9 @@ import org.apache.commons.lang.StringUtils;
|
|
|
|
|
|
import java.util.Collections;
|
|
import java.util.Collections;
|
|
import java.util.Date;
|
|
import java.util.Date;
|
|
|
|
+import java.util.HashMap;
|
|
import java.util.List;
|
|
import java.util.List;
|
|
|
|
+import java.util.Map;
|
|
import java.util.concurrent.Executors;
|
|
import java.util.concurrent.Executors;
|
|
import java.util.concurrent.ScheduledExecutorService;
|
|
import java.util.concurrent.ScheduledExecutorService;
|
|
import java.util.concurrent.TimeUnit;
|
|
import java.util.concurrent.TimeUnit;
|
|
@@ -194,7 +196,7 @@ public class MasterRegistryClient {
|
|
failoverMaster(serverHost);
|
|
failoverMaster(serverHost);
|
|
break;
|
|
break;
|
|
case WORKER:
|
|
case WORKER:
|
|
- failoverWorker(serverHost, true, true);
|
|
|
|
|
|
+ failoverWorker(serverHost);
|
|
break;
|
|
break;
|
|
default:
|
|
default:
|
|
break;
|
|
break;
|
|
@@ -275,79 +277,114 @@ public class MasterRegistryClient {
|
|
* 3. failover all tasks when workerHost is null
|
|
* 3. failover all tasks when workerHost is null
|
|
*
|
|
*
|
|
* @param workerHost worker host
|
|
* @param workerHost worker host
|
|
- * @param needCheckWorkerAlive need check worker alive
|
|
|
|
- * @param checkOwner need check process instance owner
|
|
|
|
*/
|
|
*/
|
|
- private void failoverWorker(String workerHost, boolean needCheckWorkerAlive, boolean checkOwner) {
|
|
|
|
- logger.info("start worker[{}] failover ...", workerHost);
|
|
|
|
|
|
+ private void failoverWorker(String workerHost) {
|
|
|
|
+ if (StringUtils.isEmpty(workerHost)) {
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ long startTime = System.currentTimeMillis();
|
|
List<TaskInstance> needFailoverTaskInstanceList = processService.queryNeedFailoverTaskInstances(workerHost);
|
|
List<TaskInstance> needFailoverTaskInstanceList = processService.queryNeedFailoverTaskInstances(workerHost);
|
|
- for (TaskInstance taskInstance : needFailoverTaskInstanceList) {
|
|
|
|
- if (needCheckWorkerAlive) {
|
|
|
|
- if (!checkTaskInstanceNeedFailover(taskInstance)) {
|
|
|
|
- continue;
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+ Map<Integer, ProcessInstance> processInstanceCacheMap = new HashMap<>();
|
|
|
|
+ logger.info("start worker[{}] failover, task list size:{}", workerHost, needFailoverTaskInstanceList.size());
|
|
|
|
|
|
- ProcessInstance processInstance = processService.findProcessInstanceDetailById(taskInstance.getProcessInstanceId());
|
|
|
|
- if (workerHost == null
|
|
|
|
- || !checkOwner
|
|
|
|
- || processInstance.getHost().equalsIgnoreCase(getLocalAddress())) {
|
|
|
|
- // only failover the task owned myself if worker down.
|
|
|
|
|
|
+ for (TaskInstance taskInstance : needFailoverTaskInstanceList) {
|
|
|
|
+ ProcessInstance processInstance = processInstanceCacheMap.get(taskInstance.getProcessInstanceId());
|
|
|
|
+ if (processInstance == null) {
|
|
|
|
+ processInstance = processService.findProcessInstanceDetailById(taskInstance.getProcessInstanceId());
|
|
if (processInstance == null) {
|
|
if (processInstance == null) {
|
|
- logger.error("failover error, the process {} of task {} do not exists.",
|
|
|
|
|
|
+ logger.error("failover task instance error, processInstance {} of taskInstance {} is null",
|
|
taskInstance.getProcessInstanceId(), taskInstance.getId());
|
|
taskInstance.getProcessInstanceId(), taskInstance.getId());
|
|
continue;
|
|
continue;
|
|
}
|
|
}
|
|
- taskInstance.setProcessInstance(processInstance);
|
|
|
|
-
|
|
|
|
- TaskExecutionContext taskExecutionContext = TaskExecutionContextBuilder.get()
|
|
|
|
- .buildTaskInstanceRelatedInfo(taskInstance)
|
|
|
|
- .buildProcessInstanceRelatedInfo(processInstance)
|
|
|
|
- .create();
|
|
|
|
- // only kill yarn job if exists , the local thread has exited
|
|
|
|
- ProcessUtils.killYarnJob(taskExecutionContext);
|
|
|
|
-
|
|
|
|
- taskInstance.setState(ExecutionStatus.NEED_FAULT_TOLERANCE);
|
|
|
|
- processService.saveTaskInstance(taskInstance);
|
|
|
|
-
|
|
|
|
- if (!processInstanceExecCacheManager.contains(processInstance.getId())) {
|
|
|
|
- continue;
|
|
|
|
- }
|
|
|
|
- WorkflowExecuteThread workflowExecuteThreadNotify = processInstanceExecCacheManager.getByProcessInstanceId(processInstance.getId());
|
|
|
|
- StateEvent stateEvent = new StateEvent();
|
|
|
|
- stateEvent.setTaskInstanceId(taskInstance.getId());
|
|
|
|
- stateEvent.setType(StateEventType.TASK_STATE_CHANGE);
|
|
|
|
- stateEvent.setProcessInstanceId(processInstance.getId());
|
|
|
|
- stateEvent.setExecutionStatus(taskInstance.getState());
|
|
|
|
- workflowExecuteThreadNotify.addStateEvent(stateEvent);
|
|
|
|
|
|
+ processInstanceCacheMap.put(processInstance.getId(), processInstance);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ // only failover the task owned myself if worker down.
|
|
|
|
+ if (processInstance.getHost().equalsIgnoreCase(getLocalAddress())) {
|
|
|
|
+ logger.info("failover task instance id: {}, process instance id: {}", taskInstance.getId(), taskInstance.getProcessInstanceId());
|
|
|
|
+ failoverTaskInstance(processInstance, taskInstance);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
- logger.info("end worker[{}] failover ...", workerHost);
|
|
|
|
|
|
+ logger.info("end worker[{}] failover, useTime:{}ms", workerHost, System.currentTimeMillis() - startTime);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
- * failover master tasks
|
|
|
|
|
|
+ * failover master
|
|
|
|
+ * <p>
|
|
|
|
+ * failover process instance and associated task instance
|
|
*
|
|
*
|
|
* @param masterHost master host
|
|
* @param masterHost master host
|
|
*/
|
|
*/
|
|
private void failoverMaster(String masterHost) {
|
|
private void failoverMaster(String masterHost) {
|
|
- logger.info("start master failover ...");
|
|
|
|
|
|
+ if (StringUtils.isEmpty(masterHost)) {
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
|
|
|
|
+ long startTime = System.currentTimeMillis();
|
|
List<ProcessInstance> needFailoverProcessInstanceList = processService.queryNeedFailoverProcessInstances(masterHost);
|
|
List<ProcessInstance> needFailoverProcessInstanceList = processService.queryNeedFailoverProcessInstances(masterHost);
|
|
|
|
+ logger.info("start master[{}] failover, process list size:{}", masterHost, needFailoverProcessInstanceList.size());
|
|
|
|
|
|
- logger.info("failover process list size:{} ", needFailoverProcessInstanceList.size());
|
|
|
|
- //updateProcessInstance host is null and insert into command
|
|
|
|
for (ProcessInstance processInstance : needFailoverProcessInstanceList) {
|
|
for (ProcessInstance processInstance : needFailoverProcessInstanceList) {
|
|
- logger.info("failover process instance id: {} host:{}", processInstance.getId(), processInstance.getHost());
|
|
|
|
if (Constants.NULL.equals(processInstance.getHost())) {
|
|
if (Constants.NULL.equals(processInstance.getHost())) {
|
|
continue;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ logger.info("failover process instance id: {}", processInstance.getId());
|
|
|
|
+
|
|
|
|
+ List<TaskInstance> validTaskInstanceList = processService.findValidTaskListByProcessId(processInstance.getId());
|
|
|
|
+ for (TaskInstance taskInstance : validTaskInstanceList) {
|
|
|
|
+ if (Constants.NULL.equals(taskInstance.getHost())) {
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+ logger.info("failover task instance id: {}, process instance id: {}", taskInstance.getId(), taskInstance.getProcessInstanceId());
|
|
|
|
+ failoverTaskInstance(processInstance, taskInstance);
|
|
|
|
+ }
|
|
|
|
+ //updateProcessInstance host is null and insert into command
|
|
processService.processNeedFailoverProcessInstances(processInstance);
|
|
processService.processNeedFailoverProcessInstances(processInstance);
|
|
}
|
|
}
|
|
- failoverWorker(masterHost, true, false);
|
|
|
|
|
|
|
|
- logger.info("master failover end");
|
|
|
|
|
|
+ logger.info("master[{}] failover end, useTime:{}ms", masterHost, System.currentTimeMillis() - startTime);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ private void failoverTaskInstance(ProcessInstance processInstance, TaskInstance taskInstance) {
|
|
|
|
+ if (taskInstance == null) {
|
|
|
|
+ logger.error("failover task instance error, taskInstance is null");
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (processInstance == null) {
|
|
|
|
+ logger.error("failover task instance error, processInstance {} of taskInstance {} is null",
|
|
|
|
+ taskInstance.getProcessInstanceId(), taskInstance.getId());
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (!checkTaskInstanceNeedFailover(taskInstance)) {
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ taskInstance.setProcessInstance(processInstance);
|
|
|
|
+ TaskExecutionContext taskExecutionContext = TaskExecutionContextBuilder.get()
|
|
|
|
+ .buildTaskInstanceRelatedInfo(taskInstance)
|
|
|
|
+ .buildProcessInstanceRelatedInfo(processInstance)
|
|
|
|
+ .create();
|
|
|
|
+
|
|
|
|
+ // only kill yarn job if exists , the local thread has exited
|
|
|
|
+ ProcessUtils.killYarnJob(taskExecutionContext);
|
|
|
|
+
|
|
|
|
+ taskInstance.setState(ExecutionStatus.NEED_FAULT_TOLERANCE);
|
|
|
|
+ processService.saveTaskInstance(taskInstance);
|
|
|
|
+
|
|
|
|
+ WorkflowExecuteThread workflowExecuteThreadNotify = processInstanceExecCacheManager.getByProcessInstanceId(processInstance.getId());
|
|
|
|
+ if (workflowExecuteThreadNotify == null) {
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+ StateEvent stateEvent = new StateEvent();
|
|
|
|
+ stateEvent.setTaskInstanceId(taskInstance.getId());
|
|
|
|
+ stateEvent.setType(StateEventType.TASK_STATE_CHANGE);
|
|
|
|
+ stateEvent.setProcessInstanceId(processInstance.getId());
|
|
|
|
+ stateEvent.setExecutionStatus(taskInstance.getState());
|
|
|
|
+ workflowExecuteThreadNotify.addStateEvent(stateEvent);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|