diff --git a/header/src/main/java/org/zstack/header/storage/ceph/CephSiblingFenceExtensionPoint.java b/header/src/main/java/org/zstack/header/storage/ceph/CephSiblingFenceExtensionPoint.java new file mode 100644 index 00000000000..a9a4ed54550 --- /dev/null +++ b/header/src/main/java/org/zstack/header/storage/ceph/CephSiblingFenceExtensionPoint.java @@ -0,0 +1,18 @@ +package org.zstack.header.storage.ceph; + +import org.zstack.header.core.ReturnValueCompletion; + +/** + * SPI for Ceph HA sibling-fence (ZSTAC-83890). + * Implementation lives in premium {@code storage-ha-plugin}. + * Called from {@code CephPrimaryStorageFactory.preInstantiateVmResource} when + * Ceph watcher list is empty — SSH-kills stale QEMU on the failed host before + * HA starts the VM to prevent split-brain. + */ +public interface CephSiblingFenceExtensionPoint { + void fenceVmOnFailedHost(String failedHostUuid, + String vmUuid, + String clusterUuid, + String haTargetHostUuid, + ReturnValueCompletion completion); +} diff --git a/header/src/main/java/org/zstack/header/storage/ceph/SiblingFenceVmOnHostMsg.java b/header/src/main/java/org/zstack/header/storage/ceph/SiblingFenceVmOnHostMsg.java new file mode 100644 index 00000000000..a3bfad147cc --- /dev/null +++ b/header/src/main/java/org/zstack/header/storage/ceph/SiblingFenceVmOnHostMsg.java @@ -0,0 +1,42 @@ +package org.zstack.header.storage.ceph; + +import org.zstack.header.message.NeedReplyMessage; + +public class SiblingFenceVmOnHostMsg extends NeedReplyMessage { + private String failedHostUuid; + private String vmUuid; + private String clusterUuid; + private String haTargetHostUuid; + + public String getFailedHostUuid() { + return failedHostUuid; + } + + public void setFailedHostUuid(String failedHostUuid) { + this.failedHostUuid = failedHostUuid; + } + + public String getVmUuid() { + return vmUuid; + } + + public void setVmUuid(String vmUuid) { + this.vmUuid = vmUuid; + } + + public String getClusterUuid() { + return clusterUuid; + } + + public void setClusterUuid(String clusterUuid) { + this.clusterUuid = clusterUuid; + } + + public String getHaTargetHostUuid() { + return haTargetHostUuid; + } + + public void setHaTargetHostUuid(String haTargetHostUuid) { + this.haTargetHostUuid = haTargetHostUuid; + } +} diff --git a/header/src/main/java/org/zstack/header/storage/ceph/SiblingFenceVmOnHostReply.java b/header/src/main/java/org/zstack/header/storage/ceph/SiblingFenceVmOnHostReply.java new file mode 100644 index 00000000000..400297f8f6b --- /dev/null +++ b/header/src/main/java/org/zstack/header/storage/ceph/SiblingFenceVmOnHostReply.java @@ -0,0 +1,69 @@ +package org.zstack.header.storage.ceph; + +import org.zstack.header.message.MessageReply; + +public class SiblingFenceVmOnHostReply extends MessageReply { + private boolean alive; + private boolean killed; + private boolean sshReachable; + private boolean qemuFound; + private String executorHostUuid; + private String executorRole; + private String reason; + + public boolean isAlive() { + return alive; + } + + public void setAlive(boolean alive) { + this.alive = alive; + } + + public boolean isKilled() { + return killed; + } + + public void setKilled(boolean killed) { + this.killed = killed; + } + + public boolean isSshReachable() { + return sshReachable; + } + + public void setSshReachable(boolean sshReachable) { + this.sshReachable = sshReachable; + } + + public boolean isQemuFound() { + return qemuFound; + } + + public void setQemuFound(boolean qemuFound) { + this.qemuFound = qemuFound; + } + + public String getExecutorHostUuid() { + return executorHostUuid; + } + + public void setExecutorHostUuid(String executorHostUuid) { + this.executorHostUuid = executorHostUuid; + } + + public String getExecutorRole() { + return executorRole; + } + + public void setExecutorRole(String executorRole) { + this.executorRole = executorRole; + } + + public String getReason() { + return reason; + } + + public void setReason(String reason) { + this.reason = reason; + } +} diff --git a/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageFactory.java b/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageFactory.java index cb7dde9c5cc..2c6cc8b71e2 100755 --- a/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageFactory.java +++ b/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageFactory.java @@ -34,6 +34,7 @@ import org.zstack.header.configuration.userconfig.InstanceOfferingUserConfig; import org.zstack.header.configuration.userconfig.InstanceOfferingUserConfigValidator; import org.zstack.header.core.Completion; +import org.zstack.header.core.ReturnValueCompletion; import org.zstack.header.core.WhileDoneCompletion; import org.zstack.header.core.progress.TaskProgressRange; import org.zstack.header.core.workflow.*; @@ -47,6 +48,8 @@ import org.zstack.header.host.HostVO_; import org.zstack.header.message.MessageReply; import org.zstack.header.storage.backup.*; +import org.zstack.header.storage.ceph.CephSiblingFenceExtensionPoint; +import org.zstack.header.storage.ceph.SiblingFenceVmOnHostReply; import org.zstack.header.storage.primary.*; import org.zstack.header.storage.snapshot.*; import org.zstack.header.vm.*; @@ -1229,7 +1232,50 @@ public void run(MessageReply reply) { GetVolumeWatchersReply rly = (GetVolumeWatchersReply)reply; List watchers = rly.getWatchers(); if (watchers == null || watchers.isEmpty()) { - completion.success(); + List exts = + pluginRgty.getExtensionList(CephSiblingFenceExtensionPoint.class); + if (exts.isEmpty()) { + // Open-source / no premium installed -> preserve original behaviour + completion.success(); + return; + } + + final String vmUuid = spec.getVmInventory().getUuid(); + final String haTargetHostUuid = spec.getDestHost().getUuid(); + final String clusterUuid = spec.getDestHost().getClusterUuid(); + final String failedHostUuid = spec.getVmInventory().getLastHostUuid() != null + ? spec.getVmInventory().getLastHostUuid() + : spec.getVmInventory().getHostUuid(); + if (failedHostUuid == null) { + // No previous host -> fresh instantiation, no fence needed + completion.success(); + return; + } + + CephSiblingFenceExtensionPoint ext = exts.get(0); + logger.debug(String.format("dispatching ceph sibling-fence for vm[%s] on failedHost[%s], target[%s]", + vmUuid, failedHostUuid, haTargetHostUuid)); + ext.fenceVmOnFailedHost(failedHostUuid, vmUuid, clusterUuid, haTargetHostUuid, + new ReturnValueCompletion(completion) { + @Override + public void success(SiblingFenceVmOnHostReply r) { + if (!r.isAlive() || r.isKilled()) { + logger.debug(String.format("ceph sibling-fence succeeded for vm[%s] on failedHost[%s]: killed=%s sshReachable=%s", + vmUuid, failedHostUuid, r.isKilled(), r.isSshReachable())); + completion.success(); + } else { + logger.warn(String.format("ceph sibling-fence failed for vm[%s] on failedHost[%s]: %s — blocking VM start to prevent split-brain", + vmUuid, failedHostUuid, r.getReason())); + completion.fail(operr("sibling fence failed: %s", r.getReason())); + } + } + @Override + public void fail(ErrorCode err) { + logger.warn(String.format("ceph sibling-fence error for vm[%s] on failedHost[%s]: %s", + vmUuid, failedHostUuid, err)); + completion.fail(err); + } + }); return; }