Browse Source

[bug fix] fix: two task instances are generated when the workflow fails over (#2833)

* feature: add number configuration for master dispatch tasks

* fix bug(#2762): the master would be blocked when the worker group does not exist

* fix bug(#2762): the master would be blocked when the worker group does not exist

* fix ut

* fix ut

* fix bug(#2781): cannot pause workflow when task state is "submit success"

* fix code smell

* add blank check for the MySQL "other" param

* test

* update comments

* update comments

* add ut

* fix bug: after the worker service is restarted, previously submitted tasks are not executed

* update comments

* add sleep

* add null pointer check

* fix bug: after master failover, the workflow cannot resume operation

* fix bug: do not fail over processes whose host is 'NULL'

* fix bug: worker failover error.

Co-authored-by: baoliang <baoliang@analysys.com.cn>
bao liang 4 years ago
parent
commit
d67436ffad

+ 7 - 3
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/worker/processor/TaskCallbackService.java

@@ -95,14 +95,18 @@ public class TaskCallbackService {
         if(newChannel != null){
             return getRemoteChannel(newChannel, nettyRemoteChannel.getOpaque(), taskInstanceId);
         }
-        logger.warn("original master : {} is not reachable, random select master", nettyRemoteChannel.getHost());
+        logger.warn("original master : {} for task : {} is not reachable, random select master",
+                nettyRemoteChannel.getHost(),
+                taskInstanceId);
         Set<String> masterNodes = null;
         while (Stopper.isRunning()) {
             masterNodes = zookeeperRegistryCenter.getMasterNodesDirectly();
             if (CollectionUtils.isEmpty(masterNodes)) {
-                logger.error("no available master node");
                 ThreadUtils.sleep(SLEEP_TIME_MILLIS);
             }else {
+                logger.error("find {} masters for task : {}.",
+                        masterNodes.size(),
+                        taskInstanceId);
                 break;
             }
         }
@@ -112,7 +116,7 @@ public class TaskCallbackService {
                 return getRemoteChannel(newChannel, nettyRemoteChannel.getOpaque(), taskInstanceId);
             }
         }
-        throw new IllegalStateException(String.format("all available master nodes : %s are not reachable", masterNodes));
+        throw new IllegalStateException(String.format("all available master nodes : %s are not reachable for task: {}", masterNodes, taskInstanceId));
     }
 
     private NettyRemoteChannel getRemoteChannel(Channel newChannel, long opaque, int taskInstanceId){

+ 1 - 1
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/zk/ZKMasterClient.java

@@ -262,7 +262,7 @@ public class ZKMasterClient extends AbstractZKClient {
 		Date workerServerStartDate = null;
 		List<Server> workerServers = getServersList(ZKNodeType.WORKER);
 		for(Server workerServer : workerServers){
-			if(workerServer.getHost().equals(taskInstance.getHost())){
+		    if(taskInstance.getHost().equals(workerServer.getHost() + Constants.COLON + workerServer.getPort())){
 				workerServerStartDate = workerServer.getCreateTime();
 				break;
 			}

+ 1 - 1
dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/zk/AbstractZKClient.java

@@ -187,7 +187,7 @@ public abstract class AbstractZKClient extends ZookeeperCachedOperator {
 		}
 		Map<String, String> serverMaps = getServerMaps(zkNodeType);
 		for(String hostKey : serverMaps.keySet()){
-			if(hostKey.startsWith(host)){
+			if(hostKey.contains(host)){
 				return true;
 			}
 		}