github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/cloudml/cloudml_benchmark_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import time
import unittest
import uuid

import pytest

try:
  import apache_beam.testing.benchmarks.cloudml.cloudml_benchmark_constants_lib as lib
  from apache_beam.testing.benchmarks.cloudml.pipelines import workflow
  from apache_beam.testing.load_tests.load_test_metrics_utils import InfluxDBMetricsPublisherOptions
  from apache_beam.testing.load_tests.load_test_metrics_utils import MetricsReader
  from apache_beam.testing.test_pipeline import TestPipeline
except ImportError:
  raise unittest.SkipTest('Dependencies are not installed')

# The Criteo benchmark datasets live in a shared GCS bucket; each test writes
# its transform output under a unique directory in the temp bucket below.
_INPUT_GCS_BUCKET_ROOT = 'gs://apache-beam-ml/datasets/cloudml/criteo'
_CRITEO_FEATURES_FILE = 'testdata/criteo/expected/features.tfrecord.gz'
_OUTPUT_GCS_BUCKET_ROOT = 'gs://temp-storage-for-end-to-end-tests/tft/'


def _publish_metrics(pipeline, metric_value, metrics_table, metric_name):
  """Publishes a single metric value to both BigQuery and InfluxDB."""
  influx_options = InfluxDBMetricsPublisherOptions(
      metrics_table,
      pipeline.get_option('influx_db_name'),
      pipeline.get_option('influx_hostname'),
      os.getenv('INFLUXDB_USER'),
      os.getenv('INFLUXDB_USER_PASSWORD'),
  )
  metric_reader = MetricsReader(
      project_name=pipeline.get_option('project'),
      bq_table=metrics_table,
      bq_dataset=pipeline.get_option('metrics_dataset'),
      publish_to_bq=True,
      influxdb_options=influx_options,
  )
  metric_reader.publish_values([(
      metric_name,
      metric_value,
  )])
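
# Illustrative sketch (not executed by the tests): how a benchmark reports its
# runtime through _publish_metrics. The options read via get_option()
# ('project', 'metrics_dataset', 'influx_db_name', 'influx_hostname') are
# supplied on the test pipeline's command line, and the InfluxDB credentials
# come from the INFLUXDB_USER / INFLUXDB_USER_PASSWORD environment variables.
# The table name here is a hypothetical placeholder:
#
#   pipeline = TestPipeline(is_integration_test=True)
#   start = time.time()
#   workflow.run(pipeline.get_full_options_as_args())
#   _publish_metrics(
#       pipeline=pipeline,
#       metric_value=time.time() - start,
#       metrics_table='my_benchmark_table',
#       metric_name='runtime_sec')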

@pytest.mark.uses_tft
class CloudMLTFTBenchmarkTest(unittest.TestCase):
  def test_cloudml_benchmark_criteo_small(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    extra_opts = {}
    extra_opts['input'] = os.path.join(
        _INPUT_GCS_BUCKET_ROOT, lib.INPUT_CRITEO_SMALL)
    extra_opts['benchmark_type'] = 'tft'
    extra_opts['classifier'] = 'criteo'
    extra_opts['frequency_threshold'] = 0
    extra_opts['output'] = os.path.join(
        _OUTPUT_GCS_BUCKET_ROOT, uuid.uuid4().hex)
    start_time = time.time()
    workflow.run(test_pipeline.get_full_options_as_args(**extra_opts))
    end_time = time.time()

    metrics_table = 'cloudml_benchmark_criteo_small'
    _publish_metrics(
        pipeline=test_pipeline,
        metric_value=end_time - start_time,
        metrics_table=metrics_table,
        metric_name='runtime_sec')

  def test_cloudml_benchmark_criteo_no_shuffle_10GB(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    extra_opts = {}
    extra_opts['input'] = os.path.join(
        _INPUT_GCS_BUCKET_ROOT, lib.INPUT_CRITEO_10GB)
    extra_opts['benchmark_type'] = 'tft'
    extra_opts['classifier'] = 'criteo'
    extra_opts['frequency_threshold'] = 0
    extra_opts['output'] = os.path.join(
        _OUTPUT_GCS_BUCKET_ROOT, uuid.uuid4().hex)
    extra_opts['shuffle'] = False
    start_time = time.time()
    workflow.run(test_pipeline.get_full_options_as_args(**extra_opts))
    end_time = time.time()

    metrics_table = 'cloudml_benchmark_criteo_no_shuffle_10GB'
    _publish_metrics(
        pipeline=test_pipeline,
        metric_value=end_time - start_time,
        metrics_table=metrics_table,
        metric_name='runtime_sec')

  def test_cloudml_benchmark_criteo_10GB(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    extra_opts = {}
    extra_opts['input'] = os.path.join(
        _INPUT_GCS_BUCKET_ROOT, lib.INPUT_CRITEO_10GB)
    extra_opts['benchmark_type'] = 'tft'
    extra_opts['classifier'] = 'criteo'
    extra_opts['frequency_threshold'] = 0
    extra_opts['output'] = os.path.join(
        _OUTPUT_GCS_BUCKET_ROOT, uuid.uuid4().hex)
    start_time = time.time()
    workflow.run(test_pipeline.get_full_options_as_args(**extra_opts))
    end_time = time.time()

    metrics_table = 'cloudml_benchmark_criteo_10GB'
    _publish_metrics(
        pipeline=test_pipeline,
        metric_value=end_time - start_time,
        metrics_table=metrics_table,
        metric_name='runtime_sec')


if __name__ == '__main__':
  unittest.main()
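
# Sketch of a typical invocation (assumed, not prescribed by this file):
# these are integration benchmarks, so TestPipeline(is_integration_test=True)
# skips them unless pipeline options are supplied, e.g. via the
# --test-pipeline-options flag that TestPipeline understands. The option
# names mirror the get_option() calls above; concrete values are placeholders:
#
#   pytest -m uses_tft cloudml_benchmark_test.py \
#     --test-pipeline-options="
#       --project=<gcp-project>
#       --metrics_dataset=<bq-dataset>
#       --influx_db_name=<influx-db>
#       --influx_hostname=<influx-host>"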