Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distinguish between different VDDK validation errors #969

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 145 additions & 35 deletions pkg/controller/plan/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ const (
unsupportedVersion = "UnsupportedVersion"
VDDKInvalid = "VDDKInvalid"
ValidatingVDDK = "ValidatingVDDK"
VDDKInitImageNotReady = "VDDKInitImageNotReady"
VDDKInitImageUnavailable = "VDDKInitImageUnavailable"
)

// Categories
Expand Down Expand Up @@ -760,6 +762,42 @@ func (r *Reconciler) validateHooks(plan *api.Plan) (err error) {
}

func (r *Reconciler) validateVddkImage(plan *api.Plan) (err error) {
source := plan.Referenced.Provider.Source
if source == nil {
return liberr.New("source provider is not set")
}
destination := plan.Referenced.Provider.Destination
if destination == nil {
return liberr.New("destination provider is not set")
}

if source.Type() != api.VSphere {
// VDDK is not used for other provider types
return
mnecas marked this conversation as resolved.
Show resolved Hide resolved
}

if _, found := source.Spec.Settings[api.VDDK]; found {
var job *batchv1.Job
if job, err = r.ensureVddkImageValidationJob(plan); err != nil {
return
}
err = r.validateVddkImageJob(job, plan)
}

return
}

// jobExceedsDeadline reports whether the time elapsed since the job's
// recorded start time, in seconds, is greater than the configured
// VddkJobActiveDeadline migration setting. A job that has no start time
// recorded is never considered past the deadline.
func jobExceedsDeadline(job *batchv1.Job) bool {
	deadlineSeconds := float64(settings.Settings.Migration.VddkJobActiveDeadline)

	startedAt := job.Status.StartTime
	if startedAt == nil {
		return false
	}

	elapsedSeconds := meta.Now().Sub(startedAt.Time).Seconds()
	return elapsedSeconds > deadlineSeconds
}

func (r *Reconciler) validateVddkImageJob(job *batchv1.Job, plan *api.Plan) (err error) {
image := plan.Referenced.Provider.Source.Spec.Settings[api.VDDK]
vddkInvalid := libcnd.Condition{
Type: VDDKInvalid,
Status: True,
Expand All @@ -775,49 +813,118 @@ func (r *Reconciler) validateVddkImage(plan *api.Plan) (err error) {
Message: "Validating VDDK init image",
}

source := plan.Referenced.Provider.Source
if source == nil {
if len(job.Status.Conditions) == 0 {
r.Log.Info("validation of VDDK job is in progress", "image", image)
plan.Status.SetCondition(vddkValidationInProgress)
}
var ctx *plancontext.Context
ctx, err = plancontext.New(r, plan, r.Log)
if err != nil {
return
}
destination := plan.Referenced.Provider.Destination
if destination == nil {
// check if a pod exists for the job
pods := &core.PodList{}
if err = ctx.Destination.Client.List(context.TODO(), pods, &client.ListOptions{
Namespace: plan.Spec.TargetNamespace,
LabelSelector: labels.SelectorFromSet(map[string]string{"job-name": job.Name}),
}); err != nil {
return
}
if len(pods.Items) > 0 {
pod := pods.Items[0]
if len(pod.Status.InitContainerStatuses) == 0 {
return liberr.New("Validation pod doesn't contain expected init container", "pod", pod)
}
waiting := pod.Status.InitContainerStatuses[0].State.Waiting
if waiting != nil {
if jobExceedsDeadline(job) {
// If we've exceeded the deadline, set a `warning` condition to increase
// severity. Don't set it as `critical` because the job will continue retrying
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it should be critical, so that we don't allow users to start the migration.

// indefinitely until the pull succeeds or the provider's vddk init image URL is
// updated.
plan.Status.SetCondition(libcnd.Condition{
Type: VDDKInitImageUnavailable,
Status: True,
Reason: waiting.Reason,
Category: Warn,
Message: "Unable to Pull VDDK init image. Check that the image URL is correct.",
})
} else {
plan.Status.SetCondition(libcnd.Condition{
Type: VDDKInitImageNotReady,
Status: True,
Reason: waiting.Reason,
Category: Advisory,
Message: waiting.Message,
})
}
} else {
plan.Status.DeleteCondition(VDDKInitImageNotReady)
}
}
for _, condition := range job.Status.Conditions {
switch condition.Type {
case batchv1.JobComplete:
r.Log.Info("validate VDDK job completed", "image", image)
err = nil
return
case batchv1.JobFailed:
plan.Status.SetCondition(vddkInvalid)
err = nil
return
default:
err = liberr.New("validation of VDDK job has an unexpected condition", "type", condition.Type)
}
}

return
}

// Cancel all other vddk validation jobs that are currently running for the
// plan. This is necessary because validation jobs do not have a deadline,
// so they will keep trying indefinitely if they can't pull the image. If the
// VDDK URL is later changed, we will launch a new validation job, and the old
// validation job is no longer relevant, so we can just kill it.
func (r *Reconciler) cancelOtherActiveVddkCheckJobs(plan *api.Plan) (err error) {
ctx, err := plancontext.New(r, plan, r.Log)
if err != nil {
return
}
jobLabels := getVddkImageValidationJobLabels(ctx.Plan)

if source.Type() != api.VSphere {
// VDDK is not used for other provider types
queryLabels := make(map[string]string, 1)
queryLabels["plan"] = jobLabels["plan"]
delete(queryLabels, "vddk")

jobs := &batchv1.JobList{}
if err = ctx.Destination.Client.List(
context.TODO(),
jobs,
&client.ListOptions{
LabelSelector: labels.SelectorFromSet(queryLabels),
Namespace: plan.Spec.TargetNamespace,
},
); err != nil {
return
}

if image, found := source.Spec.Settings[api.VDDK]; found {
var job *batchv1.Job
if job, err = r.ensureVddkImageValidationJob(plan, image); err != nil {
return
}
if len(job.Status.Conditions) == 0 {
r.Log.Info("validation of VDDK job is in progress", "image", image)
plan.Status.SetCondition(vddkValidationInProgress)
}
for _, condition := range job.Status.Conditions {
switch condition.Type {
case batchv1.JobComplete:
r.Log.Info("validate VDDK job completed", "image", image)
err = nil
return
case batchv1.JobFailed:
plan.Status.SetCondition(vddkInvalid)
err = nil
for _, job := range jobs.Items {
if job.Status.Active > 0 && job.Labels["vddk"] != jobLabels["vddk"] {
r.Log.Info("Another validation job is active for this plan. Stopping...", "job", job)
// make sure to delete the pod associated with this job so that it doesn't
// become orphaned while trying to pull its image indefinitely
fg := meta.DeletePropagationForeground
opts := &client.DeleteOptions{PropagationPolicy: &fg}
if err = ctx.Destination.Client.Delete(context.TODO(), &job, opts); err != nil {
return
default:
err = liberr.New("validation of VDDK job has an unexpected condition", "type", condition.Type)
}
}
}

return
return nil
}

func (r *Reconciler) ensureVddkImageValidationJob(plan *api.Plan, vddkImage string) (*batchv1.Job, error) {
func (r *Reconciler) ensureVddkImageValidationJob(plan *api.Plan) (*batchv1.Job, error) {
ctx, err := plancontext.New(r, plan, r.Log)
if err != nil {
return nil, err
Expand All @@ -827,6 +934,8 @@ func (r *Reconciler) ensureVddkImageValidationJob(plan *api.Plan, vddkImage stri
return nil, liberr.Wrap(err)
}

r.cancelOtherActiveVddkCheckJobs(ctx.Plan)

jobLabels := getVddkImageValidationJobLabels(ctx.Plan)
jobs := &batchv1.JobList{}
err = ctx.Destination.Client.List(
Expand All @@ -841,7 +950,7 @@ func (r *Reconciler) ensureVddkImageValidationJob(plan *api.Plan, vddkImage stri
case err != nil:
return nil, err
case len(jobs.Items) == 0:
job := createVddkCheckJob(ctx.Plan, jobLabels, vddkImage)
job := createVddkCheckJob(ctx.Plan)
err = ctx.Destination.Client.Create(context.Background(), job)
if err != nil {
return nil, err
Expand Down Expand Up @@ -872,7 +981,9 @@ func getVddkImageValidationJobLabels(plan *api.Plan) map[string]string {
}
}

func createVddkCheckJob(plan *api.Plan, labels map[string]string, vddkImage string) *batchv1.Job {
func createVddkCheckJob(plan *api.Plan) *batchv1.Job {
vddkImage := plan.Referenced.Provider.Source.Spec.Settings[api.VDDK]

mount := core.VolumeMount{
Name: VddkVolumeName,
MountPath: "/opt",
Expand Down Expand Up @@ -913,21 +1024,20 @@ func createVddkCheckJob(plan *api.Plan, labels map[string]string, vddkImage stri
ObjectMeta: meta.ObjectMeta{
GenerateName: fmt.Sprintf("vddk-validator-%s", plan.Name),
Namespace: plan.Spec.TargetNamespace,
Labels: labels,
Labels: getVddkImageValidationJobLabels(plan),
Annotations: map[string]string{
"provider": plan.Referenced.Provider.Source.Name,
"vddk": vddkImage,
"plan": plan.Name,
},
},
Spec: batchv1.JobSpec{
ActiveDeadlineSeconds: ptr.To[int64](int64(settings.Settings.Migration.VddkJobActiveDeadline)),
BackoffLimit: ptr.To[int32](2),
Completions: ptr.To[int32](1),
BackoffLimit: ptr.To[int32](2),
Completions: ptr.To[int32](1),
Template: core.PodTemplateSpec{
Spec: core.PodSpec{
SecurityContext: psc,
RestartPolicy: core.RestartPolicyOnFailure,
RestartPolicy: core.RestartPolicyNever,
InitContainers: initContainers,
Containers: []core.Container{
{
Expand Down
Loading