Skip to content

Commit ecfca79

Browse files
turboFeipan3793
andcommitted
[KYUUBI #7033] Treat YARN/Kubernetes application NOT_FOUND as failed to prevent data quality issue
### Why are the changes needed? Currently, NOT_FOUND application stated is treated as a terminated but not failed state. It might cause some data quality issue if downstream application depends on the batch state for data processing. So, I think we should treat NOT_FOUND as a failed state instead. Currently, we support 3 types of application manager. 1. [JpsApplicationOperation](https://github.com/apache/kyuubi/blob/master/kyuubi-server/src/main/scala/org/apache/kyuubi/engine/JpsApplicationOperation.scala) 2. [YarnApplicationOperation](https://github.com/apache/kyuubi/blob/master/kyuubi-server/src/main/scala/org/apache/kyuubi/engine/YarnApplicationOperation.scala) 3. [KubernetesApplicationOperation](https://github.com/apache/kyuubi/blob/master/kyuubi-server/src/main/scala/org/apache/kyuubi/engine/KubernetesApplicationOperation.scala) YarnApplicationOperation and KubernetesApplicationOperation are widely used in production use case. And in multiple kyuubi instance mode, the NOT_FOUND case should rarely happen. 1. https://github.com/apache/kyuubi/blob/7e199d6fdbdf52222bb3eadd056b9e5a2295f36e/kyuubi-server/src/main/scala/org/apache/kyuubi/server/api/v1/BatchesResource.scala#L369-L385 3. #7029 So, I think we should treat NOT_FOUND as a failed state in production use case. It is better to fail some corner cases than to mistakenly set unsuccessful batches to the finished state. ### How was this patch tested? GA. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #7033 from turboFei/revist_not_found. Closes #7033 ada4f88 [Cheng Pan] Update kyuubi-server/src/main/scala/org/apache/kyuubi/engine/ApplicationOperation.scala 985e23c [Wang, Fei] Refine f03d612 [Wang, Fei] comments b9d6ac2 [Wang, Fei] incase the metadata updated by peer instance 3bd61ca [Wang, Fei] add 339df47 [Wang, Fei] treat NOT_FOUND as failed Lead-authored-by: Wang, Fei <[email protected]> Co-authored-by: Cheng Pan <[email protected]> Signed-off-by: Cheng Pan <[email protected]>
1 parent 02a6b13 commit ecfca79

File tree

7 files changed

+60
-14
lines changed

7 files changed

+60
-14
lines changed

kyuubi-server/src/main/scala/org/apache/kyuubi/engine/ApplicationOperation.scala

+14-1
Original file line numberDiff line numberDiff line change
@@ -85,15 +85,28 @@ trait ApplicationOperation {
8585
tag: String,
8686
proxyUser: Option[String] = None,
8787
submitTime: Option[Long] = None): ApplicationInfo
88+
89+
/**
90+
* Whether the application state can be persisted and retrieved after finished.
91+
* @return true if the application state can be persisted
92+
*/
93+
def supportPersistedAppState: Boolean
8894
}
8995

9096
object ApplicationState extends Enumeration {
9197
type ApplicationState = Value
9298
val PENDING, RUNNING, FINISHED, KILLED, FAILED, ZOMBIE, NOT_FOUND, UNKNOWN = Value
9399

94-
def isFailed(state: ApplicationState): Boolean = state match {
100+
def isFailed(
101+
state: ApplicationState,
102+
appOperation: Option[ApplicationOperation]): Boolean = {
103+
isFailed(state, appOperation.exists(_.supportPersistedAppState))
104+
}
105+
106+
def isFailed(state: ApplicationState, supportPersistedAppState: Boolean): Boolean = state match {
95107
case FAILED => true
96108
case KILLED => true
109+
case NOT_FOUND if supportPersistedAppState => true
97110
case _ => false
98111
}
99112

kyuubi-server/src/main/scala/org/apache/kyuubi/engine/JpsApplicationOperation.scala

+2
Original file line numberDiff line numberDiff line change
@@ -105,5 +105,7 @@ class JpsApplicationOperation extends ApplicationOperation {
105105
// TODO check if the process is zombie
106106
}
107107

108+
override def supportPersistedAppState: Boolean = false
109+
108110
override def stop(): Unit = {}
109111
}

kyuubi-server/src/main/scala/org/apache/kyuubi/engine/KubernetesApplicationOperation.scala

+6-3
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ class KubernetesApplicationOperation extends ApplicationOperation with Logging {
137137
val shouldDelete = cleanupDriverPodStrategy match {
138138
case NONE => false
139139
case ALL => true
140-
case COMPLETED => !ApplicationState.isFailed(notification.getValue)
140+
case COMPLETED => !ApplicationState.isFailed(notification.getValue, Some(this))
141141
}
142142
if (shouldDelete) {
143143
deletePod(kubernetesInfo, removed.podName.orNull, appLabel)
@@ -289,6 +289,8 @@ class KubernetesApplicationOperation extends ApplicationOperation with Logging {
289289
}
290290
}
291291

292+
override def supportPersistedAppState: Boolean = true
293+
292294
override def stop(): Unit = {
293295
enginePodInformers.asScala.foreach { case (_, informer) =>
294296
Utils.tryLogNonFatalError(informer.stop())
@@ -594,8 +596,8 @@ object KubernetesApplicationOperation extends Logging {
594596
case Some(containerAppState) => containerAppState
595597
case None => podAppState
596598
}
597-
val applicationError =
598-
if (ApplicationState.isFailed(applicationState)) {
599+
val applicationError = {
600+
if (ApplicationState.isFailed(applicationState, supportPersistedAppState = true)) {
599601
val errorMap = containerStatusToBuildAppState.map { cs =>
600602
Map(
601603
"Pod" -> podName,
@@ -609,6 +611,7 @@ object KubernetesApplicationOperation extends Logging {
609611
} else {
610612
None
611613
}
614+
}
612615
applicationState -> applicationError
613616
}
614617

kyuubi-server/src/main/scala/org/apache/kyuubi/engine/KyuubiApplicationManager.scala

+5
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ class KyuubiApplicationManager(metadataManager: Option[MetadataManager])
103103
operations.find(_.isInstanceOf[KubernetesApplicationOperation])
104104
.map(_.asInstanceOf[KubernetesApplicationOperation])
105105
}
106+
107+
private[kyuubi] def getApplicationOperation(appMgrInfo: ApplicationManagerInfo)
108+
: Option[ApplicationOperation] = {
109+
operations.find(_.isSupported(appMgrInfo))
110+
}
106111
}
107112

108113
object KyuubiApplicationManager {

kyuubi-server/src/main/scala/org/apache/kyuubi/engine/YarnApplicationOperation.scala

+2
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,8 @@ class YarnApplicationOperation extends ApplicationOperation with Logging {
165165
}
166166
}
167167

168+
override def supportPersistedAppState: Boolean = true
169+
168170
override def stop(): Unit = adminYarnClient.foreach { yarnClient =>
169171
Utils.tryLogNonFatalError(yarnClient.stop())
170172
}

kyuubi-server/src/main/scala/org/apache/kyuubi/operation/BatchJobSubmission.scala

+29-9
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ import java.util.concurrent.TimeUnit
2222

2323
import com.codahale.metrics.MetricRegistry
2424
import com.google.common.annotations.VisibleForTesting
25+
import org.apache.commons.lang3.StringUtils
2526

2627
import org.apache.kyuubi.{KyuubiException, KyuubiSQLException, Utils}
2728
import org.apache.kyuubi.config.KyuubiConf
28-
import org.apache.kyuubi.engine.{ApplicationInfo, ApplicationState, KillResponse, ProcBuilder}
29+
import org.apache.kyuubi.engine.{ApplicationInfo, ApplicationOperation, ApplicationState, KillResponse, ProcBuilder}
2930
import org.apache.kyuubi.engine.spark.SparkBatchProcessBuilder
3031
import org.apache.kyuubi.metrics.MetricsConstants.OPERATION_OPEN
3132
import org.apache.kyuubi.metrics.MetricsSystem
@@ -99,6 +100,8 @@ class BatchJobSubmission(
99100
getOperationLog)
100101
}
101102

103+
private lazy val appOperation = applicationManager.getApplicationOperation(builder.appMgrInfo())
104+
102105
def startupProcessAlive: Boolean =
103106
builder.processLaunched && Option(builder.process).exists(_.isAlive)
104107

@@ -212,6 +215,20 @@ class BatchJobSubmission(
212215
metadata match {
213216
case Some(metadata) if metadata.peerInstanceClosed =>
214217
setState(OperationState.CANCELED)
218+
case Some(metadata)
219+
// in case it has been updated by peer kyuubi instance, see KYUUBI #6278
220+
if StringUtils.isNotBlank(metadata.engineState) &&
221+
ApplicationState.isTerminated(ApplicationState.withName(metadata.engineState)) =>
222+
_applicationInfo = Some(new ApplicationInfo(
223+
id = metadata.engineId,
224+
name = metadata.engineName,
225+
state = ApplicationState.withName(metadata.engineState),
226+
url = Option(metadata.engineUrl),
227+
error = metadata.engineError))
228+
if (applicationFailed(_applicationInfo, appOperation)) {
229+
throw new KyuubiException(
230+
s"$batchType batch[$batchId] job failed: ${_applicationInfo}")
231+
}
215232
case Some(metadata) if metadata.state == OperationState.PENDING.toString =>
216233
// case 1: new batch job created using batch impl v2
217234
// case 2: batch job from recovery, do submission only when previous state is
@@ -275,7 +292,7 @@ class BatchJobSubmission(
275292
try {
276293
info(s"Submitting $batchType batch[$batchId] job:\n$builder")
277294
val process = builder.start
278-
while (process.isAlive && !applicationFailed(_applicationInfo)) {
295+
while (process.isAlive && !applicationFailed(_applicationInfo, appOperation)) {
279296
doUpdateApplicationInfoMetadataIfNeeded()
280297
process.waitFor(applicationCheckInterval, TimeUnit.MILLISECONDS)
281298
}
@@ -284,7 +301,7 @@ class BatchJobSubmission(
284301
doUpdateApplicationInfoMetadataIfNeeded()
285302
}
286303

287-
if (applicationFailed(_applicationInfo)) {
304+
if (applicationFailed(_applicationInfo, appOperation)) {
288305
Utils.terminateProcess(process, applicationStartupDestroyTimeout)
289306
throw new KyuubiException(s"Batch job failed: ${_applicationInfo}")
290307
}
@@ -329,10 +346,9 @@ class BatchJobSubmission(
329346
setStateIfNotCanceled(OperationState.RUNNING)
330347
}
331348
if (_applicationInfo.isEmpty) {
332-
info(s"The $batchType batch[$batchId] job: $appId not found, assume that it has finished.")
333-
return
349+
_applicationInfo = Some(ApplicationInfo.NOT_FOUND)
334350
}
335-
if (applicationFailed(_applicationInfo)) {
351+
if (applicationFailed(_applicationInfo, appOperation)) {
336352
throw new KyuubiException(s"$batchType batch[$batchId] job failed: ${_applicationInfo}")
337353
}
338354
updateBatchMetadata()
@@ -341,7 +357,7 @@ class BatchJobSubmission(
341357
Thread.sleep(applicationCheckInterval)
342358
updateApplicationInfoMetadataIfNeeded()
343359
}
344-
if (applicationFailed(_applicationInfo)) {
360+
if (applicationFailed(_applicationInfo, appOperation)) {
345361
throw new KyuubiException(s"$batchType batch[$batchId] job failed: ${_applicationInfo}")
346362
}
347363
}
@@ -445,8 +461,12 @@ class BatchJobSubmission(
445461
}
446462

447463
object BatchJobSubmission {
448-
def applicationFailed(applicationStatus: Option[ApplicationInfo]): Boolean = {
449-
applicationStatus.map(_.state).exists(ApplicationState.isFailed)
464+
def applicationFailed(
465+
applicationStatus: Option[ApplicationInfo],
466+
appOperation: Option[ApplicationOperation]): Boolean = {
467+
applicationStatus.map(_.state).exists { state =>
468+
ApplicationState.isFailed(state, appOperation)
469+
}
450470
}
451471

452472
def applicationTerminated(applicationStatus: Option[ApplicationInfo]): Boolean = {

kyuubi-server/src/main/scala/org/apache/kyuubi/server/api/v1/BatchesResource.scala

+2-1
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,9 @@ private[v1] class BatchesResource extends ApiRequestContext with Logging {
129129
metadata: Metadata,
130130
batchAppStatus: Option[ApplicationInfo]): Batch = {
131131
batchAppStatus.map { appStatus =>
132+
val appOp = sessionManager.applicationManager.getApplicationOperation(metadata.appMgrInfo)
132133
val currentBatchState =
133-
if (BatchJobSubmission.applicationFailed(batchAppStatus)) {
134+
if (BatchJobSubmission.applicationFailed(batchAppStatus, appOp)) {
134135
OperationState.ERROR.toString
135136
} else if (BatchJobSubmission.applicationTerminated(batchAppStatus)) {
136137
OperationState.FINISHED.toString

0 commit comments

Comments
 (0)