Browse Source

[fix][worker][bug] master/worker crash when registry recovers from SUSPENDED to RECONNECTED (#13328)

hokie-chan 2 years ago
parent
commit
3b980cb06a

+ 16 - 12
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterWaitingStrategy.java

@@ -93,18 +93,22 @@ public class MasterWaitingStrategy implements MasterConnectStrategy {
 
 
     @Override
     @Override
     public void reconnect() {
     public void reconnect() {
-        try {
-            ServerLifeCycleManager.recoverFromWaiting();
-            reStartMasterResource();
-            // reopen the resource
-            logger.info("Recover from waiting success, the current server status is {}",
-                    ServerLifeCycleManager.getServerStatus());
-        } catch (Exception e) {
-            String errorMessage =
-                    String.format("Recover from waiting failed, the current server status is %s, will stop the server",
-                            ServerLifeCycleManager.getServerStatus());
-            logger.error(errorMessage, e);
-            registryClient.getStoppable().stop(errorMessage);
+        if (ServerLifeCycleManager.isRunning()) {
+            logger.info("no need to reconnect, as the current server status is running");
+        } else {
+            try {
+                ServerLifeCycleManager.recoverFromWaiting();
+                reStartMasterResource();
+                logger.info("Recover from waiting success, the current server status is {}",
+                        ServerLifeCycleManager.getServerStatus());
+            } catch (Exception e) {
+                String errorMessage =
+                        String.format(
+                                "Recover from waiting failed, the current server status is %s, will stop the server",
+                                ServerLifeCycleManager.getServerStatus());
+                logger.error(errorMessage, e);
+                registryClient.getStoppable().stop(errorMessage);
+            }
         }
         }
     }
     }
 
 

+ 16 - 12
dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/registry/WorkerWaitingStrategy.java

@@ -93,19 +93,23 @@ public class WorkerWaitingStrategy implements WorkerConnectStrategy {
 
 
     @Override
     @Override
     public void reconnect() {
     public void reconnect() {
-        try {
-            ServerLifeCycleManager.recoverFromWaiting();
-            reStartWorkerResource();
-            logger.info("Recover from waiting success, the current server status is {}",
-                    ServerLifeCycleManager.getServerStatus());
-        } catch (Exception e) {
-            String errorMessage =
-                    String.format("Recover from waiting failed, the current server status is %s, will stop the server",
-                            ServerLifeCycleManager.getServerStatus());
-            logger.error(errorMessage, e);
-            registryClient.getStoppable().stop(errorMessage);
+        if (ServerLifeCycleManager.isRunning()) {
+            logger.info("no need to reconnect, as the current server status is running");
+        } else {
+            try {
+                ServerLifeCycleManager.recoverFromWaiting();
+                reStartWorkerResource();
+                logger.info("Recover from waiting success, the current server status is {}",
+                        ServerLifeCycleManager.getServerStatus());
+            } catch (Exception e) {
+                String errorMessage =
+                        String.format(
+                                "Recover from waiting failed, the current server status is %s, will stop the server",
+                                ServerLifeCycleManager.getServerStatus());
+                logger.error(errorMessage, e);
+                registryClient.getStoppable().stop(errorMessage);
+            }
         }
         }
-
     }
     }
 
 
     @Override
     @Override