瀏覽代碼

Solve the deadlock problem caused by queuing (#13191)

* Solve the deadlock problem caused by queuing

* Solve the deadlock problem caused by queuing

* Solve the deadlock problem caused by queuing

* Solve the deadlock problem caused by queuing, move the event to the tail by throwing an exception

Co-authored-by: wfs <wangfushun@cdqcp.cpm>
sssqhai 2 年之前
父節點
當前提交
7a0a2c2a46

+ 33 - 0
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandleFailure.java

@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.dolphinscheduler.server.master.event;
+
+/**
+ * This exception represents a failure that can be recovered from: when we catch this exception,
+ * we will move the event to the tail of the queue.
+ */
+public class StateEventHandleFailure extends Exception {
+
+    public StateEventHandleFailure(String message) {
+        super(message);
+    }
+
+    public StateEventHandleFailure(String message, Throwable throwable) {
+        super(message, throwable);
+    }
+}

+ 2 - 1
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java

@@ -28,9 +28,10 @@ public interface StateEventHandler {
      * @param stateEvent given state event.
      * @throws StateEventHandleException this exception means it can be recovered.
      * @throws StateEventHandleError     this exception means it cannot be recovered, so the event need to drop.
+     * @throws StateEventHandleFailure   this exception means it can be recovered, the event will be moved to the tail of the queue.
      */
     boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
-                             StateEvent stateEvent) throws StateEventHandleException, StateEventHandleError;
+                             StateEvent stateEvent) throws StateEventHandleException, StateEventHandleError, StateEventHandleFailure;
 
     StateEventType getEventType();
 }

+ 6 - 2
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java

@@ -31,9 +31,13 @@ public class TaskWaitTaskGroupStateHandler implements StateEventHandler {
     private static final Logger logger = LoggerFactory.getLogger(TaskWaitTaskGroupStateHandler.class);
 
     @Override
-    public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent) {
+    public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
+                                    StateEvent stateEvent) throws StateEventHandleFailure {
         logger.info("Handle task instance wait task group event, taskInstanceId: {}", stateEvent.getTaskInstanceId());
-        return workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent);
+        if (!workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent)) {
+            throw new StateEventHandleFailure("Task state event handle failed due to robing taskGroup resource failed");
+        }
+        return true;
     }
 
     @Override

+ 8 - 0
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java

@@ -71,6 +71,7 @@ import org.apache.dolphinscheduler.server.master.dispatch.executor.NettyExecutor
 import org.apache.dolphinscheduler.server.master.event.StateEvent;
 import org.apache.dolphinscheduler.server.master.event.StateEventHandleError;
 import org.apache.dolphinscheduler.server.master.event.StateEventHandleException;
+import org.apache.dolphinscheduler.server.master.event.StateEventHandleFailure;
 import org.apache.dolphinscheduler.server.master.event.StateEventHandler;
 import org.apache.dolphinscheduler.server.master.event.StateEventHandlerManager;
 import org.apache.dolphinscheduler.server.master.event.TaskStateEvent;
@@ -315,6 +316,13 @@ public class WorkflowExecuteRunnable implements Callable<WorkflowSubmitStatue> {
                         stateEvent,
                         stateEventHandleException);
                 ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
+            } catch (StateEventHandleFailure stateEventHandleFailure) {
+                logger.error("State event handle failed, will move event to the tail: {}",
+                        stateEvent,
+                        stateEventHandleFailure);
+                this.stateEvents.remove(stateEvent);
+                this.stateEvents.offer(stateEvent);
+                ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
             } catch (Exception e) {
                 // we catch the exception here, since if the state event handle failed, the state event will still keep
                 // in the stateEvents queue.