github.com/pachyderm/pachyderm@v1.13.4/src/server/worker/stats/stats_test.go (about) 1 package stats 2 3 import ( 4 "fmt" 5 "io" 6 "os" 7 "strings" 8 "testing" 9 "time" 10 11 "golang.org/x/net/context" 12 13 "github.com/pachyderm/pachyderm/src/client" 14 "github.com/pachyderm/pachyderm/src/client/pfs" 15 "github.com/pachyderm/pachyderm/src/client/pkg/require" 16 "github.com/pachyderm/pachyderm/src/client/pps" 17 tu "github.com/pachyderm/pachyderm/src/server/pkg/testutil" 18 19 "github.com/pachyderm/pachyderm/src/client/pkg/errors" 20 prom_api "github.com/prometheus/client_golang/api" 21 prom_api_v1 "github.com/prometheus/client_golang/api/prometheus/v1" 22 prom_model "github.com/prometheus/common/model" 23 ) 24 25 func TestPrometheusStats(t *testing.T) { 26 c := tu.GetPachClient(t) 27 defer require.NoError(t, c.DeleteAll()) 28 require.NoError(t, tu.ActivateEnterprise(t, c)) 29 30 dataRepo := tu.UniqueString("TestSimplePipeline_data") 31 require.NoError(t, c.CreateRepo(dataRepo)) 32 33 pipeline := tu.UniqueString("TestSimplePipeline") 34 // We want several commits (for multiple jobs) and several datums per job 35 // For semi meaningful time series results 36 numCommits := 5 37 numDatums := 10 38 39 _, err := c.PpsAPIClient.CreatePipeline( 40 c.Ctx(), 41 &pps.CreatePipelineRequest{ 42 Pipeline: client.NewPipeline(pipeline), 43 Transform: &pps.Transform{ 44 Cmd: []string{"bash"}, 45 Stdin: []string{ 46 // We want non const runtime vals so histogram rate queries return 47 // real results (not NaNs): 48 "sleep $(( ( RANDOM % 10 ) + 1 ))", 49 // Include case where we will err: 50 fmt.Sprintf("touch /pfs/%s/test; if [[ $(cat /pfs/%s/test) == 'fail' ]]; then echo 'Failing'; exit 1; fi", dataRepo, dataRepo), 51 fmt.Sprintf("cp /pfs/%s/* /pfs/out/", dataRepo), 52 }, 53 }, 54 ParallelismSpec: &pps.ParallelismSpec{ 55 Constant: uint64(numDatums), 56 }, 57 Input: client.NewPFSInput(dataRepo, "/*"), 58 OutputBranch: "", 59 Update: false, 60 EnableStats: true, 61 }, 62 ) 63 require.NoError(t, err) 64 65 var commit *pfs.Commit 66 // Do numCommits-1 commits w good data 67 for i := 0; i < numCommits-1; i++ { 68 commit, err = c.StartCommit(dataRepo, "master") 69 require.NoError(t, err) 70 // We want several datums per job so that we have multiple data points 71 // per job time series 72 for j := 0; j < numDatums; j++ { 73 _, err = c.PutFile(dataRepo, commit.ID, fmt.Sprintf("file%v", j), strings.NewReader("bar")) 74 require.NoError(t, err) 75 } 76 require.NoError(t, err) 77 require.NoError(t, c.FinishCommit(dataRepo, commit.ID)) 78 // Prometheus scrapes every 10s 79 // We run a new job outside this window so that we see a more organic 80 // time series 81 time.Sleep(15 * time.Second) 82 } 83 // Now write data that'll make the job fail 84 commit, err = c.StartCommit(dataRepo, "master") 85 require.NoError(t, err) 86 _, err = c.PutFile(dataRepo, commit.ID, "test", strings.NewReader("fail")) 87 require.NoError(t, err) 88 require.NoError(t, c.FinishCommit(dataRepo, commit.ID)) 89 90 _, err = c.FlushCommit([]*pfs.Commit{commit}, nil) 91 require.NoError(t, err) 92 93 port := os.Getenv("PROM_PORT") 94 promClient, err := prom_api.NewClient(prom_api.Config{ 95 Address: fmt.Sprintf("http://127.0.0.1:%v", port), 96 }) 97 require.NoError(t, err) 98 promAPI := prom_api_v1.NewAPI(promClient) 99 100 // Prometheus scrapes ~ every 15s, but empirically, we miss data unless I 101 // wait this long. This is annoying, and also why this is a single giant 102 // test, not many little ones 103 time.Sleep(45 * time.Second) 104 105 datumCountQuery := func(t *testing.T, query string) float64 { 106 result, _, err := promAPI.Query(context.Background(), query, time.Now()) 107 require.NoError(t, err) 108 resultVec := result.(prom_model.Vector) 109 require.Equal(t, 1, len(resultVec)) 110 return float64(resultVec[0].Value) 111 } 112 // Datum count queries 113 t.Run("DatumCountStarted", func(t *testing.T) { 114 query := fmt.Sprintf("sum(pachyderm_worker_datum_count{pipelineName=\"%v\", state=\"started\"})", pipeline) 115 result := datumCountQuery(t, query) 116 require.Equal(t, float64((numCommits-1)*numDatums+3), result) // 3 extra for failed datum restarts on the last job 117 }) 118 119 t.Run("DatumCountFinished", func(t *testing.T) { 120 query := fmt.Sprintf("sum(pachyderm_worker_datum_count{pipelineName=\"%v\", state=\"finished\"})", pipeline) 121 result := datumCountQuery(t, query) 122 require.Equal(t, float64((numCommits-1)*numDatums), result) 123 }) 124 125 t.Run("DatumCountErrored", func(t *testing.T) { 126 query := fmt.Sprintf("sum(pachyderm_worker_datum_count{pipelineName=\"%v\", state=\"errored\"})", pipeline) 127 result := datumCountQuery(t, query) 128 require.Equal(t, float64(3.0), result) 129 }) 130 131 // Bytes Counters 132 t.Run("DatumDownloadBytes", func(t *testing.T) { 133 query := fmt.Sprintf("sum(pachyderm_worker_datum_download_bytes_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline) 134 result := datumCountQuery(t, query) 135 // Each run adds 30 bytes to the total 136 require.Equal(t, float64(30.0*(numCommits-1.0)*numCommits/2.0), result) 137 }) 138 t.Run("DatumUploadBytes", func(t *testing.T) { 139 query := fmt.Sprintf("sum(pachyderm_worker_datum_upload_bytes_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline) 140 result := datumCountQuery(t, query) 141 // Each run adds 30 bytes to the total 142 require.Equal(t, float64(30.0*(numCommits-1.0)*numCommits/2.0), result) 143 }) 144 145 // Time Counters 146 t.Run("DatumUploadSeconds", func(t *testing.T) { 147 query := fmt.Sprintf("sum(pachyderm_worker_datum_upload_seconds_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline) 148 datumCountQuery(t, query) // Just check query has a result 149 }) 150 t.Run("DatumProcSeconds", func(t *testing.T) { 151 query := fmt.Sprintf("sum(pachyderm_worker_datum_proc_seconds_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline) 152 datumCountQuery(t, query) // Just check query has a result 153 }) 154 t.Run("DatumDownloadSeconds", func(t *testing.T) { 155 query := fmt.Sprintf("sum(pachyderm_worker_datum_download_seconds_count{pipelineName=\"%v\"}) without (instance, exported_job)", pipeline) 156 datumCountQuery(t, query) // Just check query has a result 157 }) 158 159 // Test queries across all jobs 160 filter := "(instance,exported_job)" 161 // 'instance' is an auto recorded label w the IP of the pod ... this will 162 // become helpful when debugging certain workers. For now, we filter it out 163 // to see results across instances 164 // 'exported_job' is just the job ID, but is named as such because 'job' is 165 // a reserved keyword for prometheus labels. We filter it out so we see 166 // results across all jobs 167 168 // Avg Datum Time Queries 169 avgDatumQuery := func(t *testing.T, sumQuery string, countQuery string, expected int) { 170 query := "(" + sumQuery + ")/(" + countQuery + ")" 171 result, _, err := promAPI.Query(context.Background(), query, time.Now()) 172 require.NoError(t, err) 173 resultVec := result.(prom_model.Vector) 174 require.Equal(t, expected, len(resultVec)) 175 } 176 for _, segment := range []string{"download", "upload"} { 177 t.Run(fmt.Sprintf("AcrossJobsDatumTime=%v", segment), func(t *testing.T) { 178 sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 179 count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 180 avgDatumQuery(t, sum, count, 1) 181 }) 182 } 183 segment := "proc" 184 t.Run(fmt.Sprintf("AcrossJobsDatumTime=%v", segment), func(t *testing.T) { 185 sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 186 count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 187 // sum gets aggregated no matter how many datums ran, but we get one result for finished datums, and one for errored datums 188 avgDatumQuery(t, sum, count, 2) 189 }) 190 191 // Avg Datum Size Queries 192 for _, segment := range []string{"download", "upload"} { 193 t.Run(fmt.Sprintf("AcrossJobsDatumSize=%v", segment), func(t *testing.T) { 194 sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_size_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 195 count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_size_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 196 avgDatumQuery(t, sum, count, 1) 197 }) 198 } 199 200 // Now test aggregating per job 201 filter = "(instance)" 202 203 // Avg Datum Time Queries 204 expectedCounts := map[string]int{ 205 "download": numCommits + 1, // We expect 5 jobs, plus there's always an extra value w no job label 206 "upload": numCommits - 1, // Since 1 job failed, there will only be 4 upload times 207 } 208 for _, segment := range []string{"download", "upload"} { 209 t.Run(fmt.Sprintf("PerJobDatumTime=%v", segment), func(t *testing.T) { 210 sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 211 count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 212 avgDatumQuery(t, sum, count, expectedCounts[segment]) 213 }) 214 } 215 segment = "proc" 216 t.Run(fmt.Sprintf("PerJobDatumTime=%v", segment), func(t *testing.T) { 217 sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 218 count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_time_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 219 avgDatumQuery(t, sum, count, numCommits) 220 }) 221 222 // Avg Datum Size Queries 223 expectedCounts = map[string]int{ 224 "download": numCommits - 1, // Download size gets reported after job completion, and one job fails 225 "upload": numCommits - 1, // Since 1 job failed, there will only be 4 upload times 226 } 227 for _, segment := range []string{"download", "upload"} { 228 t.Run(fmt.Sprintf("PerJobDatumSize=%v", segment), func(t *testing.T) { 229 sum := fmt.Sprintf("sum(pachyderm_worker_datum_%v_size_sum{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 230 count := fmt.Sprintf("sum(pachyderm_worker_datum_%v_size_count{pipelineName=\"%v\"}) without %v", segment, pipeline, filter) 231 avgDatumQuery(t, sum, count, expectedCounts[segment]) 232 }) 233 } 234 } 235 236 // Regression: stats commits would not close when there were no input datums. 237 //For more info, see github.com/pachyderm/pachyderm/issues/3337 238 func TestCloseStatsCommitWithNoInputDatums(t *testing.T) { 239 c := tu.GetPachClient(t) 240 defer require.NoError(t, c.DeleteAll()) 241 require.NoError(t, tu.ActivateEnterprise(t, c)) 242 243 dataRepo := tu.UniqueString("TestSimplePipeline_data") 244 require.NoError(t, c.CreateRepo(dataRepo)) 245 246 pipeline := tu.UniqueString("TestSimplePipeline") 247 248 _, err := c.PpsAPIClient.CreatePipeline( 249 c.Ctx(), 250 &pps.CreatePipelineRequest{ 251 Pipeline: client.NewPipeline(pipeline), 252 Transform: &pps.Transform{ 253 Cmd: []string{"bash"}, 254 Stdin: []string{"sleep 1"}, 255 }, 256 Input: client.NewPFSInput(dataRepo, "/*"), 257 OutputBranch: "", 258 Update: false, 259 EnableStats: true, 260 }, 261 ) 262 require.NoError(t, err) 263 264 commit, err := c.StartCommit(dataRepo, "master") 265 require.NoError(t, err) 266 require.NoError(t, c.FinishCommit(dataRepo, commit.ID)) 267 268 // If the error exists, the stats commit will never close, and this will 269 // timeout 270 commitIter, err := c.FlushCommit([]*pfs.Commit{commit}, nil) 271 require.NoError(t, err) 272 273 for { 274 _, err := commitIter.Next() 275 if errors.Is(err, io.EOF) { 276 break 277 } 278 require.NoError(t, err) 279 } 280 281 // Make sure the job succeeded as well 282 jobs, err := c.ListJob(pipeline, nil, nil, -1, true) 283 require.NoError(t, err) 284 require.Equal(t, 1, len(jobs)) 285 jobInfo, err := c.InspectJob(jobs[0].Job.ID, true) 286 require.NoError(t, err) 287 require.Equal(t, pps.JobState_JOB_SUCCESS, jobInfo.State) 288 }