github.com/pachyderm/pachyderm@v1.13.4/src/server/worker/stats/stats_test.go

package stats

import (
	"context"
	"fmt"
	"io"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/pachyderm/pachyderm/src/client"
	"github.com/pachyderm/pachyderm/src/client/pfs"
	"github.com/pachyderm/pachyderm/src/client/pkg/errors"
	"github.com/pachyderm/pachyderm/src/client/pkg/require"
	"github.com/pachyderm/pachyderm/src/client/pps"
	tu "github.com/pachyderm/pachyderm/src/server/pkg/testutil"

	prom_api "github.com/prometheus/client_golang/api"
	prom_api_v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	prom_model "github.com/prometheus/common/model"
)

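// TestPrometheusStats runs a pipeline over several commits, makes the last
// job fail, and then checks that the worker's Prometheus metrics (datum
// counts, byte counters, and timing histograms) line up with what ran.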
func TestPrometheusStats(t *testing.T) {
	c := tu.GetPachClient(t)
	defer require.NoError(t, c.DeleteAll())
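	// Stats collection is gated on enterprise in Pachyderm 1.x, so activate
	// an enterprise token before creating a pipeline with EnableStats.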
	require.NoError(t, tu.ActivateEnterprise(t, c))

	dataRepo := tu.UniqueString("TestSimplePipeline_data")
	require.NoError(t, c.CreateRepo(dataRepo))

	pipeline := tu.UniqueString("TestSimplePipeline")
	// We want several commits (for multiple jobs) and several datums per job,
	// so that the time series results are semi-meaningful.
	numCommits := 5
	numDatums := 10

	_, err := c.PpsAPIClient.CreatePipeline(
		c.Ctx(),
		&pps.CreatePipelineRequest{
			Pipeline: client.NewPipeline(pipeline),
			Transform: &pps.Transform{
				Cmd: []string{"bash"},
				Stdin: []string{
					// We want non-constant runtimes so that histogram rate
					// queries return real results (not NaNs):
					"sleep $(( ( RANDOM % 10 )  + 1 ))",
					// Include a case that will err:
					fmt.Sprintf("touch /pfs/%s/test; if [[ $(cat /pfs/%s/test) == 'fail' ]]; then echo 'Failing'; exit 1; fi", dataRepo, dataRepo),
					fmt.Sprintf("cp /pfs/%s/* /pfs/out/", dataRepo),
				},
			},
			ParallelismSpec: &pps.ParallelismSpec{
				Constant: uint64(numDatums),
			},
			Input:        client.NewPFSInput(dataRepo, "/*"),
			OutputBranch: "",
			Update:       false,
			EnableStats:  true,
		},
	)
	require.NoError(t, err)

	var commit *pfs.Commit
	// Do numCommits-1 commits with good data
	for i := 0; i < numCommits-1; i++ {
		commit, err = c.StartCommit(dataRepo, "master")
		require.NoError(t, err)
		// We want several datums per job so that we have multiple data points
		// per job time series
		for j := 0; j < numDatums; j++ {
			_, err = c.PutFile(dataRepo, commit.ID, fmt.Sprintf("file%v", j), strings.NewReader("bar"))
			require.NoError(t, err)
		}
		require.NoError(t, c.FinishCommit(dataRepo, commit.ID))
		// Prometheus scrapes every 10s. We run each new job outside this
		// window so that we see a more organic time series.
		time.Sleep(15 * time.Second)
	}
	// Now write data that'll make the job fail
	commit, err = c.StartCommit(dataRepo, "master")
	require.NoError(t, err)
	_, err = c.PutFile(dataRepo, commit.ID, "test", strings.NewReader("fail"))
	require.NoError(t, err)
	require.NoError(t, c.FinishCommit(dataRepo, commit.ID))

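	// Block until every job triggered by the commits above (including the
	// failing one) has finished.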
	_, err = c.FlushCommit([]*pfs.Commit{commit}, nil)
	require.NoError(t, err)

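	// Build a Prometheus API client against the server exposed to the test
	// via PROM_PORT.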
	port := os.Getenv("PROM_PORT")
	promClient, err := prom_api.NewClient(prom_api.Config{
		Address: fmt.Sprintf("http://127.0.0.1:%v", port),
	})
	require.NoError(t, err)
	promAPI := prom_api_v1.NewAPI(promClient)

	// Prometheus scrapes roughly every 15s, but empirically we miss data
	// unless we wait this long. This is annoying, and it's also why this is
	// a single giant test rather than many small ones.
	time.Sleep(45 * time.Second)

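	// datumCountQuery runs an instant PromQL query, asserts that the result
	// vector contains exactly one sample, and returns that sample's value.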
	datumCountQuery := func(t *testing.T, query string) float64 {
		result, _, err := promAPI.Query(context.Background(), query, time.Now())
		require.NoError(t, err)
		resultVec := result.(prom_model.Vector)
		require.Equal(t, 1, len(resultVec))
		return float64(resultVec[0].Value)
	}
	// Datum count queries
	t.Run("DatumCountStarted", func(t *testing.T) {
		query := fmt.Sprintf("sum(pachyderm_worker_datum_count{pipelineName=\"%v\", state=\"started\"})", pipeline)
		result := datumCountQuery(t, query)
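		// The one failing datum is retried before the job is marked failed;
		// the "+3" below suggests three tries per datum (presumably the
		// default datum-retry setting).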
		require.Equal(t, float64((numCommits-1)*numDatums+3), result) // 3 extra for failed-datum restarts on the last job
	})

	t.Run("DatumCountFinished", func(t *testing.T) {
		query := fmt.Sprintf("sum(pachyderm_worker_datum_count{pipelineName=\"%v\", state=\"finished\"})", pipeline)
		result := datumCountQuery(t, query)
		require.Equal(t, float64((numCommits-1)*numDatums), result)
	})

	t.Run("DatumCountErrored", func(t *testing.T) {
		query := fmt.Sprintf("sum(pachyderm_worker_datum_count{pipelineName=\"%v\", state=\"errored\"})", pipeline)
		result := datumCountQuery(t, query)
		require.Equal(t, float64(3), result)
	})

	// Bytes Counters
	t.Run("DatumDownloadBytes", func(t *testing.T) {
		query := fmt.Sprintf("sum(pachyderm_worker_datum_download_bytes_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline)
		result := datumCountQuery(t, query)
		// Each run adds 30 bytes to the total
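		// (The expectation below is the triangular sum
		// 30*(1+2+...+(numCommits-1)): apparently job k accounts for 30*k
		// bytes, where 30 = numDatums files x 3 bytes ("bar") per file.)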
		require.Equal(t, float64(30.0*(numCommits-1.0)*numCommits/2.0), result)
	})
	t.Run("DatumUploadBytes", func(t *testing.T) {
		query := fmt.Sprintf("sum(pachyderm_worker_datum_upload_bytes_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline)
		result := datumCountQuery(t, query)
		// Each run adds 30 bytes to the total
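		// (Same triangular-sum arithmetic as the download case: the pipeline
		// copies its input straight to /pfs/out, so upload bytes mirror
		// download bytes.)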
		require.Equal(t, float64(30.0*(numCommits-1.0)*numCommits/2.0), result)
	})

	// Time Counters
	t.Run("DatumUploadSeconds", func(t *testing.T) {
		query := fmt.Sprintf("sum(pachyderm_worker_datum_upload_seconds_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline)
		datumCountQuery(t, query) // Just check that the query has a result
	})
	t.Run("DatumProcSeconds", func(t *testing.T) {
		query := fmt.Sprintf("sum(pachyderm_worker_datum_proc_seconds_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline)
		datumCountQuery(t, query) // Just check that the query has a result
	})
	t.Run("DatumDownloadSeconds", func(t *testing.T) {
		query := fmt.Sprintf("sum(pachyderm_worker_datum_download_seconds_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline)
		datumCountQuery(t, query) // Just check that the query has a result
	})

	// Test queries across all jobs
	filter := "(instance,exported_job)"
	// 'instance' is an automatically recorded label holding the IP of the
	// pod; it will become helpful when debugging specific workers. For now,
	// we filter it out to see results across instances.
	// 'exported_job' is just the job ID; it's named that way because 'job'
	// is a reserved label name in Prometheus. We filter it out so that we
	// see results across all jobs.

	// Avg Datum Time Queries
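	// avgDatumQuery divides a histogram's _sum series by its _count series
	// in PromQL to get an average, and asserts that the result vector has
	// the expected number of series.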
	avgDatumQuery := func(t *testing.T, sumQuery string, countQuery string, expected int) {
		query := "(" + sumQuery + ")/(" + countQuery + ")"
		result, _, err := promAPI.Query(context.Background(), query, time.Now())
		require.NoError(t, err)
		resultVec := result.(prom_model.Vector)
		require.Equal(t, expected, len(resultVec))
	}
	for _, segment := range []string{"download", "upload"} {
		t.Run(fmt.Sprintf("AcrossJobsDatumTime=%v", segment), func(t *testing.T) {
			sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
			count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
			avgDatumQuery(t, sum, count, 1)
		})
	}
	segment := "proc"
	t.Run(fmt.Sprintf("AcrossJobsDatumTime=%v", segment), func(t *testing.T) {
		sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
		count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
		// The sum gets aggregated no matter how many datums ran, but we get
		// one result for finished datums and one for errored datums.
		avgDatumQuery(t, sum, count, 2)
	})

	// Avg Datum Size Queries
	for _, segment := range []string{"download", "upload"} {
		t.Run(fmt.Sprintf("AcrossJobsDatumSize=%v", segment), func(t *testing.T) {
			sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_size_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
			count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_size_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
			avgDatumQuery(t, sum, count, 1)
		})
	}

	// Now test aggregating per job
	filter = "(instance)"

	// Avg Datum Time Queries
	expectedCounts := map[string]int{
		"download": numCommits + 1, // We expect numCommits jobs, plus there's always an extra value with no job label
		"upload":   numCommits - 1, // Since one job failed, there are only numCommits-1 upload times
	}
	for _, segment := range []string{"download", "upload"} {
		t.Run(fmt.Sprintf("PerJobDatumTime=%v", segment), func(t *testing.T) {
			sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
			count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
			avgDatumQuery(t, sum, count, expectedCounts[segment])
		})
	}
	segment = "proc"
	t.Run(fmt.Sprintf("PerJobDatumTime=%v", segment), func(t *testing.T) {
		sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
		count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
		avgDatumQuery(t, sum, count, numCommits)
	})

	// Avg Datum Size Queries
	expectedCounts = map[string]int{
		"download": numCommits - 1, // Download size is reported after job completion, and one job fails
		"upload":   numCommits - 1, // Since one job failed, there are only numCommits-1 upload sizes
	}
	for _, segment := range []string{"download", "upload"} {
		t.Run(fmt.Sprintf("PerJobDatumSize=%v", segment), func(t *testing.T) {
			sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_size_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
			count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_size_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter)
			avgDatumQuery(t, sum, count, expectedCounts[segment])
		})
	}
}

// Regression test: stats commits would not close when there were no input
// datums. For more info, see github.com/pachyderm/pachyderm/issues/3337.
func TestCloseStatsCommitWithNoInputDatums(t *testing.T) {
	c := tu.GetPachClient(t)
	defer require.NoError(t, c.DeleteAll())
	require.NoError(t, tu.ActivateEnterprise(t, c))

	dataRepo := tu.UniqueString("TestSimplePipeline_data")
	require.NoError(t, c.CreateRepo(dataRepo))

	pipeline := tu.UniqueString("TestSimplePipeline")

	_, err := c.PpsAPIClient.CreatePipeline(
		c.Ctx(),
		&pps.CreatePipelineRequest{
			Pipeline: client.NewPipeline(pipeline),
			Transform: &pps.Transform{
				Cmd:   []string{"bash"},
				Stdin: []string{"sleep 1"},
			},
			Input:        client.NewPFSInput(dataRepo, "/*"),
			OutputBranch: "",
			Update:       false,
			EnableStats:  true,
		},
	)
	require.NoError(t, err)

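	// Commit no files, so the resulting job has zero input datums.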
	commit, err := c.StartCommit(dataRepo, "master")
	require.NoError(t, err)
	require.NoError(t, c.FinishCommit(dataRepo, commit.ID))

	// If the bug is present, the stats commit will never close, and this
	// will time out.
	commitIter, err := c.FlushCommit([]*pfs.Commit{commit}, nil)
	require.NoError(t, err)

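	// Drain the iterator; Next returns io.EOF once every downstream commit
	// (including the stats commit) has closed.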
	for {
		_, err := commitIter.Next()
		if errors.Is(err, io.EOF) {
			break
		}
		require.NoError(t, err)
	}

	// Make sure the job succeeded as well
	jobs, err := c.ListJob(pipeline, nil, nil, -1, true)
	require.NoError(t, err)
	require.Equal(t, 1, len(jobs))
	jobInfo, err := c.InspectJob(jobs[0].Job.ID, true)
	require.NoError(t, err)
	require.Equal(t, pps.JobState_JOB_SUCCESS, jobInfo.State)
}