github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/bigtableio_it_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for bigtableio."""
# pytype: skip-file

import datetime
import logging
import random
import string
import unittest
import uuid
from typing import TYPE_CHECKING
from typing import List

import pytest
import pytz

import apache_beam as beam
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.runner import PipelineState
from apache_beam.testing.test_pipeline import TestPipeline

# Protect against environments where the Bigtable library is not available.
# pylint: disable=wrong-import-order, wrong-import-position
try:
  from google.cloud._helpers import _datetime_from_microseconds
  from google.cloud._helpers import _microseconds_from_datetime
  from google.cloud._helpers import UTC
  from google.cloud.bigtable import row, column_family, Client
except ImportError:
  Client = None
  UTC = pytz.utc
  _microseconds_from_datetime = lambda label_stamp: label_stamp
  _datetime_from_microseconds = lambda micro: micro
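# The fallback definitions above keep this module importable when the
# Bigtable client library is missing, so the skipIf decorator on the test
# class below can take effect; the lambda stand-ins simply pass timestamps
# through unchanged.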

if TYPE_CHECKING:
  import google.cloud.bigtable.instance

EXISTING_INSTANCES = []  # type: List[google.cloud.bigtable.instance.Instance]
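# Each test instance created by this module is labelled with its creation
# timestamp (in microseconds) so that instances left behind by earlier runs
# can be found and removed in _delete_old_instances() below.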
LABEL_KEY = u'python-bigtable-beam'
label_stamp = datetime.datetime.utcnow().replace(tzinfo=UTC)
label_stamp_micros = _microseconds_from_datetime(label_stamp)
LABELS = {LABEL_KEY: str(label_stamp_micros)}


class GenerateTestRows(beam.PTransform):
  """A PTransform that generates a configurable number of `DirectRow` objects
  and writes them to a Bigtable table.
  """
  def __init__(self, number, project_id=None, instance_id=None, table_id=None):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    self.number = number
    self.rand = random.choice(string.ascii_letters + string.digits)
    self.column_family_id = 'cf1'
    self.beam_options = {
        'project_id': project_id,
        'instance_id': instance_id,
        'table_id': table_id
    }

  def _generate(self):
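    """Yields `self.number` DirectRows, each with ten cells ('field0' through
    'field9') in column family 'cf1' holding a 100-character value.
    """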
    value = ''.join(self.rand for i in range(100))

    for index in range(self.number):
      key = "beam_key%s" % ('{0:07}'.format(index))
      direct_row = row.DirectRow(row_key=key)
      for column_id in range(10):
        direct_row.set_cell(
            self.column_family_id, ('field%s' % column_id).encode('utf-8'),
            value,
            datetime.datetime.now())
      yield direct_row

  def expand(self, pvalue):
    beam_options = self.beam_options
    return (
        pvalue
        | beam.Create(self._generate())
        | WriteToBigTable(
            beam_options['project_id'],
            beam_options['instance_id'],
            beam_options['table_id']))
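
# Illustrative sketch only, not part of the original test: DirectRows can be
# written with WriteToBigTable directly, without the GenerateTestRows helper.
# The project, instance and table IDs below are placeholders.
#
#   with beam.Pipeline() as p:
#     direct_row = row.DirectRow(row_key=b'example-key')
#     direct_row.set_cell('cf1', b'field0', b'value', datetime.datetime.now())
#     _ = (
#         p
#         | beam.Create([direct_row])
#         | WriteToBigTable('my-project', 'my-instance', 'my-table'))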


@unittest.skipIf(Client is None, 'GCP Bigtable dependencies are not installed')
class BigtableIOWriteTest(unittest.TestCase):
  """Integration test for the Bigtable write connector."""
  DEFAULT_TABLE_PREFIX = "python-test"
  instance_id = DEFAULT_TABLE_PREFIX + "-" + str(uuid.uuid4())[:8]
  cluster_id = DEFAULT_TABLE_PREFIX + "-" + str(uuid.uuid4())[:8]
  table_id = DEFAULT_TABLE_PREFIX + "-" + str(uuid.uuid4())[:8]
  number = 500
  LOCATION_ID = "us-east1-b"

  def setUp(self):
    try:
      from google.cloud.bigtable import enums
      self.STORAGE_TYPE = enums.StorageType.HDD
      self.INSTANCE_TYPE = enums.Instance.Type.DEVELOPMENT
    except ImportError:
      self.STORAGE_TYPE = 2
      self.INSTANCE_TYPE = 2

    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')
    self.client = Client(project=self.project, admin=True)

    self._delete_old_instances()

    self.instance = self.client.instance(
        self.instance_id, instance_type=self.INSTANCE_TYPE, labels=LABELS)

    if not self.instance.exists():
      cluster = self.instance.cluster(
          self.cluster_id,
          self.LOCATION_ID,
          default_storage_type=self.STORAGE_TYPE)
      operation = self.instance.create(clusters=[cluster])
      operation.result(timeout=300)  # Wait up to 5 min.

    self.table = self.instance.table(self.table_id)

    if not self.table.exists():
      max_versions_rule = column_family.MaxVersionsGCRule(2)
      column_family_id = 'cf1'
      column_families = {column_family_id: max_versions_rule}
      self.table.create(column_families=column_families)

  def _delete_old_instances(self):
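    """Deletes instances created by this test (identified via LABEL_KEY) that
    are at least two hours old.
    """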
    instances = self.client.list_instances()
    EXISTING_INSTANCES[:] = instances

    def age_in_hours(micros):
      return (
          datetime.datetime.utcnow().replace(tzinfo=UTC) -
          (_datetime_from_microseconds(micros))).total_seconds() // 3600

    CLEAN_INSTANCE = [
        i for instance in EXISTING_INSTANCES for i in instance if (
            LABEL_KEY in i.labels.keys() and
            (age_in_hours(int(i.labels[LABEL_KEY])) >= 2))
    ]

    if CLEAN_INSTANCE:
      for instance in CLEAN_INSTANCE:
        instance.delete()

  def tearDown(self):
    if self.instance.exists():
      self.instance.delete()

  @pytest.mark.it_postcommit
  def test_bigtable_write(self):
    number = self.number
    pipeline_args = self.test_pipeline.options_list
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as pipeline:
      config_data = {
          'project_id': self.project,
          'instance_id': self.instance_id,
          'table_id': self.table_id
      }
      _ = (
          pipeline
          | 'Generate Direct Rows' >> GenerateTestRows(number, **config_data))

    assert pipeline.result.state == PipelineState.DONE

    read_rows = self.table.read_rows()
    assert len([_ for _ in read_rows]) == number

    if not hasattr(pipeline.result, 'has_job') or pipeline.result.has_job:
      read_filter = MetricsFilter().with_name('Written Row')
      query_result = pipeline.result.metrics().query(read_filter)
      if query_result['counters']:
        read_counter = query_result['counters'][0]

        logging.info('Number of Rows: %d', read_counter.committed)
        assert read_counter.committed == number
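
# Illustrative sketch only, not part of the original test: the 'Written Row'
# counter queried above is an ordinary Beam user metric. A DoFn can declare
# and update a counter with that name roughly as follows; the class name and
# namespace here are placeholders, not the connector's actual implementation.
#
#   from apache_beam.metrics.metric import Metrics
#
#   class _ExampleWriteFn(beam.DoFn):
#     def __init__(self):
#       super().__init__()
#       self.written = Metrics.counter(self.__class__, 'Written Row')
#
#     def process(self, element):
#       self.written.inc()
#       yield element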


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()