github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/cloudml/cloudml_benchmark_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
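"""End-to-end runtime benchmarks for the CloudML Criteo TFT pipelines.

Each test runs the tensorflow_transform (TFT) Criteo workflow from
apache_beam.testing.benchmarks.cloudml.pipelines.workflow against a Criteo
dataset stored in GCS, measures wall-clock runtime, and publishes the result
to BigQuery and InfluxDB via _publish_metrics.

These are integration tests: they are skipped when the optional dependencies
are not installed, and they rely on pipeline options such as project,
metrics_dataset, influx_db_name and influx_hostname, plus the INFLUXDB_USER
and INFLUXDB_USER_PASSWORD environment variables. An illustrative invocation
(the exact flags depend on the test environment) is:

  pytest -m uses_tft cloudml_benchmark_test.py \
      --test-pipeline-options="--project=<project> \
      --metrics_dataset=<dataset> --influx_db_name=<db> \
      --influx_hostname=<host>"
"""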
import os
import time
import unittest
import uuid

import pytest

try:
  import apache_beam.testing.benchmarks.cloudml.cloudml_benchmark_constants_lib as lib
  from apache_beam.testing.benchmarks.cloudml.pipelines import workflow
  from apache_beam.testing.load_tests.load_test_metrics_utils import InfluxDBMetricsPublisherOptions
  from apache_beam.testing.load_tests.load_test_metrics_utils import MetricsReader
  from apache_beam.testing.test_pipeline import TestPipeline
except ImportError:  # pylint: disable=bare-except
  raise unittest.SkipTest('Dependencies are not installed')

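# Input and output locations for the Criteo benchmarks; the datasets and the
# temporary benchmark output live in GCS.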
_INPUT_GCS_BUCKET_ROOT = 'gs://apache-beam-ml/datasets/cloudml/criteo'
_CRITEO_FEATURES_FILE = 'testdata/criteo/expected/features.tfrecord.gz'
_OUTPUT_GCS_BUCKET_ROOT = 'gs://temp-storage-for-end-to-end-tests/tft/'


def _publish_metrics(pipeline, metric_value, metrics_table, metric_name):
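  """Publishes a single metric value to BigQuery and InfluxDB.

  InfluxDB settings come from the pipeline's influx_db_name and
  influx_hostname options plus the INFLUXDB_USER and INFLUXDB_USER_PASSWORD
  environment variables; the BigQuery destination is metrics_table in the
  pipeline's metrics_dataset and project.
  """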
  influx_options = InfluxDBMetricsPublisherOptions(
      metrics_table,
      pipeline.get_option('influx_db_name'),
      pipeline.get_option('influx_hostname'),
      os.getenv('INFLUXDB_USER'),
      os.getenv('INFLUXDB_USER_PASSWORD'),
  )
  metric_reader = MetricsReader(
      project_name=pipeline.get_option('project'),
      bq_table=metrics_table,
      bq_dataset=pipeline.get_option('metrics_dataset'),
      publish_to_bq=True,
      influxdb_options=influx_options,
  )
  metric_reader.publish_values([(
      metric_name,
      metric_value,
  )])


@pytest.mark.uses_tft
class CloudMLTFTBenchmarkTest(unittest.TestCase):
  def test_cloudml_benchmark_criteo_small(self):
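    """Runs the TFT Criteo benchmark on the small dataset."""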
    test_pipeline = TestPipeline(is_integration_test=True)
    extra_opts = {}
    extra_opts['input'] = os.path.join(
        _INPUT_GCS_BUCKET_ROOT, lib.INPUT_CRITEO_SMALL)
    extra_opts['benchmark_type'] = 'tft'
    extra_opts['classifier'] = 'criteo'
    extra_opts['frequency_threshold'] = 0
    extra_opts['output'] = os.path.join(
        _OUTPUT_GCS_BUCKET_ROOT, uuid.uuid4().hex)
    start_time = time.time()
    workflow.run(test_pipeline.get_full_options_as_args(**extra_opts))
    end_time = time.time()

    metrics_table = 'cloudml_benchmark_criteo_small'
    _publish_metrics(
        pipeline=test_pipeline,
        metric_value=end_time - start_time,
        metrics_table=metrics_table,
        metric_name='runtime_sec')

  def test_cloudml_benchmark_criteo_no_shuffle_10GB(self):
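    """Runs the TFT Criteo benchmark on the 10 GB dataset without shuffle."""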
    test_pipeline = TestPipeline(is_integration_test=True)
    extra_opts = {}
    extra_opts['input'] = os.path.join(
        _INPUT_GCS_BUCKET_ROOT, lib.INPUT_CRITEO_10GB)
    extra_opts['benchmark_type'] = 'tft'
    extra_opts['classifier'] = 'criteo'
    extra_opts['frequency_threshold'] = 0
    extra_opts['output'] = os.path.join(
        _OUTPUT_GCS_BUCKET_ROOT, uuid.uuid4().hex)
    extra_opts['shuffle'] = False
    start_time = time.time()
    workflow.run(test_pipeline.get_full_options_as_args(**extra_opts))
    end_time = time.time()

    metrics_table = 'cloudml_benchmark_criteo_no_shuffle_10GB'
    _publish_metrics(
        pipeline=test_pipeline,
        metric_value=end_time - start_time,
        metrics_table=metrics_table,
        metric_name='runtime_sec')

  def test_cloudml_benchmark_criteo_10GB(self):
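    """Runs the TFT Criteo benchmark on the 10 GB dataset."""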
    test_pipeline = TestPipeline(is_integration_test=True)
    extra_opts = {}
    extra_opts['input'] = os.path.join(
        _INPUT_GCS_BUCKET_ROOT, lib.INPUT_CRITEO_10GB)
    extra_opts['benchmark_type'] = 'tft'
    extra_opts['classifier'] = 'criteo'
    extra_opts['frequency_threshold'] = 0
    extra_opts['output'] = os.path.join(
        _OUTPUT_GCS_BUCKET_ROOT, uuid.uuid4().hex)
    start_time = time.time()
    workflow.run(test_pipeline.get_full_options_as_args(**extra_opts))
    end_time = time.time()

    metrics_table = 'cloudml_benchmark_criteo_10GB'
    _publish_metrics(
        pipeline=test_pipeline,
        metric_value=end_time - start_time,
        metrics_table=metrics_table,
        metric_name='runtime_sec')


if __name__ == '__main__':
  unittest.main()