github.com/kubewharf/katalyst-core@v0.5.3/pkg/util/asyncworker/async_workers.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package asyncworker
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"reflect"
    23  	"strings"
    24  	"time"
    25  
    26  	"k8s.io/apimachinery/pkg/util/wait"
    27  
    28  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    29  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    30  )
    31  
    32  func NewAsyncWorkers(name string, emitter metrics.MetricEmitter) *AsyncWorkers {
    33  	return &AsyncWorkers{
    34  		name:                name,
    35  		emitter:             emitter,
    36  		lastUndeliveredWork: make(map[string]*Work),
    37  		workStatuses:        make(map[string]*workStatus),
    38  	}
    39  }
    40  
    41  func (aws *AsyncWorkers) AddWork(workName string, work *Work, policy DuplicateWorkPolicy) error {
    42  	aws.workLock.Lock()
    43  	defer aws.workLock.Unlock()
    44  
    45  	err := validateWork(work)
    46  	if err != nil {
    47  		return fmt.Errorf("validateWork for: %s failed with error: %v", workName, err)
    48  	}
    49  
    50  	general.InfoS("add work",
    51  		"AsyncWorkers", aws.name,
    52  		"workName", workName,
    53  		"params", work.Params,
    54  		"deliveredAt", work.DeliveredAt)
    55  
    56  	status, ok := aws.workStatuses[workName]
    57  	if !ok || status == nil {
    58  		general.InfoS("create status for work",
    59  			"AsyncWorkers", aws.name, "workName", workName)
    60  		status = &workStatus{}
    61  		aws.workStatuses[workName] = status
    62  	} else if status.IsWorking() && policy == DuplicateWorkPolicyDiscard {
    63  		general.InfoS("work %v already exists, discard new work", workName)
    64  		return nil
    65  	}
    66  
    67  	// dispatch a request to the pod work if none are running
    68  	if !status.IsWorking() {
    69  		general.InfoS("status isn't working, handle work immediately",
    70  			"AsyncWorkers", aws.name,
    71  			"workName", workName,
    72  			"params", work.Params,
    73  			"deliveredAt", work.DeliveredAt)
    74  
    75  		ctx := aws.contextForWork(workName, work)
    76  		go aws.handleWork(ctx, workName, work)
    77  
    78  		return nil
    79  	}
    80  
    81  	general.InfoS("status is working, queue work",
    82  		"AsyncWorkers", aws.name,
    83  		"workName", workName,
    84  		"params", work.Params,
    85  		"deliveredAt", work.DeliveredAt)
    86  
    87  	if undelivered, ok := aws.lastUndeliveredWork[workName]; ok {
    88  		general.InfoS("overwrite undelivered work",
    89  			"AsyncWorkers", aws.name,
    90  			"workName", workName,
    91  			"old params", undelivered.Params,
    92  			"old deliveredAt", undelivered.DeliveredAt,
    93  			"new params", work.Params,
    94  			"new deliveredAt", work.DeliveredAt)
    95  	}
    96  
    97  	// always set the most recent work
    98  	aws.lastUndeliveredWork[workName] = work
    99  
   100  	if status.cancelFn == nil {
   101  		general.Fatalf("[AsyncWorkers: %s] %s nil cancelFn in working status", aws.name, workName)
   102  	} else if status.work == nil {
   103  		general.Fatalf("[AsyncWorkers: %s] %s nil work in working status", aws.name, workName)
   104  	}
   105  
   106  	general.InfoS("canceling current working work",
   107  		"AsyncWorkers", aws.name,
   108  		"workName", workName,
   109  		"params", status.work.Params,
   110  		"deliveredAt", status.work.DeliveredAt)
   111  	status.cancelFn()
   112  
   113  	return nil
   114  }
   115  
   116  func (aws *AsyncWorkers) handleWork(ctx context.Context, workName string, work *Work) {
   117  	var handleErr error
   118  
   119  	defer func() {
   120  		if r := recover(); r != nil {
   121  			handleErr = fmt.Errorf("recover from %v", r)
   122  
   123  			metricErr := EmitCustomizedAsyncedMetrics(ctx,
   124  				metricNameAsyncWorkPanic, 1,
   125  				metrics.ConvertMapToTags(map[string]string{
   126  					"workName": workName,
   127  				})...)
   128  
   129  			if metricErr != nil {
   130  				general.Errorf("emit metric(%s:%d) failed with err: %v",
   131  					metricNameAsyncWorkDurationMs, 1, metricErr)
   132  			}
   133  		}
   134  
   135  		aws.completeWork(workName, work, handleErr)
   136  	}()
   137  
   138  	general.InfoS("handle work",
   139  		"AsyncWorkers", aws.name,
   140  		"workName", workName,
   141  		"params", work.Params,
   142  		"deliveredAt", work.DeliveredAt)
   143  
   144  	funcValue := reflect.ValueOf(work.Fn)
   145  
   146  	// filling up parameters for the passed functions
   147  	paramValues := make([]reflect.Value, 1, len(work.Params)+1)
   148  	paramValues[0] = reflect.ValueOf(ctx)
   149  	for _, param := range work.Params {
   150  		paramValues = append(paramValues, reflect.ValueOf(param))
   151  	}
   152  
   153  	startTime := time.Now()
   154  	funcRets := funcValue.Call(paramValues)
   155  	workDurationMs := time.Since(startTime).Milliseconds()
   156  
   157  	if len(funcRets) != 1 {
   158  		handleErr = fmt.Errorf("work Fn returns invalid number: %d of return values", len(funcRets))
   159  	} else if funcRets[0].Interface() != nil {
   160  		var ok bool
   161  		handleErr, ok = funcRets[0].Interface().(error)
   162  
   163  		if !ok {
   164  			handleErr = fmt.Errorf("work Fn returns return value: %v of invalid type", funcRets[0].Interface())
   165  		}
   166  	}
   167  
   168  	metricErr := EmitCustomizedAsyncedMetrics(ctx,
   169  		metricNameAsyncWorkDurationMs, workDurationMs,
   170  		metrics.ConvertMapToTags(map[string]string{
   171  			"workName": workName,
   172  		})...)
   173  
   174  	if metricErr != nil {
   175  		general.Errorf("emit metric(%s:%d) failed with err: %v",
   176  			metricNameAsyncWorkDurationMs, workDurationMs, metricErr)
   177  	}
   178  }
   179  
   180  func (aws *AsyncWorkers) completeWork(workName string, completedWork *Work, workErr error) {
   181  	// TODO: support retrying if workErr != nil
   182  	general.InfoS("complete work",
   183  		"AsyncWorkers", aws.name,
   184  		"workName", workName,
   185  		"params", completedWork.Params,
   186  		"deliveredAt", completedWork.DeliveredAt,
   187  		"workErr", workErr)
   188  
   189  	aws.workLock.Lock()
   190  	defer aws.workLock.Unlock()
   191  
   192  	if work, exists := aws.lastUndeliveredWork[workName]; exists {
   193  
   194  		ctx := aws.contextForWork(workName, work)
   195  
   196  		go aws.handleWork(ctx, workName, work)
   197  		delete(aws.lastUndeliveredWork, workName)
   198  	} else {
   199  		aws.resetWorkStatus(workName)
   200  	}
   201  }
   202  
   203  // contextForWork returns or initializes the appropriate context for a known
   204  // work. And point status.work to the work. If the current context is expired, it is reset.
   205  // It should be called in function protected by aws.workLock.
   206  func (aws *AsyncWorkers) contextForWork(workName string, work *Work) context.Context {
   207  	if work == nil {
   208  		general.Fatalf("[AsyncWorkers: %s] contextForWork: %s got nil work", aws.name, workName)
   209  	}
   210  
   211  	status, ok := aws.workStatuses[workName]
   212  	if !ok || status == nil {
   213  		general.Fatalf("[AsyncWorkers: %s] contextForWork: %s got no status", aws.name, workName)
   214  	}
   215  	if status.ctx == nil || status.ctx.Err() == context.Canceled {
   216  		ctx := context.Background()
   217  		if names := strings.Split(workName, WorkNameSeperator); len(names) > 0 {
   218  			ctx = context.WithValue(ctx, contextKeyMetricName, names[len(names)-1])
   219  			ctx = context.WithValue(ctx, contextKeyMetricEmitter, aws.emitter)
   220  		}
   221  		status.ctx, status.cancelFn = context.WithCancel(ctx)
   222  
   223  	}
   224  	status.working = true
   225  	status.work = work
   226  	status.startedAt = time.Now()
   227  	return status.ctx
   228  }
   229  
   230  // resetWorkStatus resets work status corresponding to workName,
   231  // when there is no work of workName to do.
   232  // It should be called in function protected by aws.workLock.
   233  func (aws *AsyncWorkers) resetWorkStatus(workName string) {
   234  	status, ok := aws.workStatuses[workName]
   235  	if !ok || status == nil {
   236  		general.Fatalf("[AsyncWorkers: %s] contextForWork: %s got no status",
   237  			aws.name, workName)
   238  	}
   239  
   240  	status.working = false
   241  	status.work = nil
   242  	status.startedAt = time.Time{}
   243  }
   244  
   245  func (aws *AsyncWorkers) Start(stopCh <-chan struct{}) error {
   246  	go wait.Until(aws.cleanupWorkStatus, 10*time.Second, stopCh)
   247  	return nil
   248  }
   249  
   250  // cleanupWorkStatus cleans up work status not in working
   251  func (aws *AsyncWorkers) cleanupWorkStatus() {
   252  	aws.workLock.Lock()
   253  	defer aws.workLock.Unlock()
   254  
   255  	for workName, status := range aws.workStatuses {
   256  		if status == nil {
   257  			general.Errorf("[AsyncWorkers: %s] nil status for %s, clean it", aws.name, workName)
   258  			delete(aws.workStatuses, workName)
   259  		} else if !status.working {
   260  			general.Errorf("[AsyncWorkers: %s] status for %s not in working, clean it", aws.name, workName)
   261  			delete(aws.workStatuses, workName)
   262  		}
   263  	}
   264  }
   265  
   266  func (aws *AsyncWorkers) WorkExists(workName string) bool {
   267  	aws.workLock.Lock()
   268  	defer aws.workLock.Unlock()
   269  
   270  	status, hasRunningWork := aws.workStatuses[workName]
   271  	if hasRunningWork && status.IsWorking() {
   272  		return true
   273  	}
   274  
   275  	_, hasUndeliveredWork := aws.lastUndeliveredWork[workName]
   276  	if hasUndeliveredWork {
   277  		return true
   278  	}
   279  
   280  	return false
   281  }