github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/cache_manager_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  # pytype: skip-file
    19  
    20  import time
    21  import unittest
    22  
    23  import apache_beam as beam
    24  from apache_beam import coders
    25  from apache_beam.runners.interactive import cache_manager as cache
    26  from apache_beam.runners.interactive import interactive_beam as ib
    27  
    28  
    29  class FileBasedCacheManagerTest(object):
    30    """Unit test for FileBasedCacheManager.
    31  
    32    Note that this set of tests focuses only the methods that interacts with
    33    the LOCAL file system. The idea is that once FileBasedCacheManager works well
    34    with the local file system, it should work with any file system with
    35    `apache_beam.io.filesystem` interface. Those tests that involve interactions
    36    with Beam pipeline (i.e. source(), sink(), ReadCache, and WriteCache) will be
    37    tested with InteractiveRunner as a part of integration tests instead.
    38    """
    39  
    40    cache_format = None  # type: str
    41  
    42    def setUp(self):
    43      self.cache_manager = cache.FileBasedCacheManager(
    44          cache_format=self.cache_format)
    45  
    46    def tearDown(self):
    47      self.cache_manager.cleanup()
    48  
    49    def mock_write_cache(self, values, prefix, cache_label):
    50      """Cache the PCollection where cache.WriteCache would write to."""
    51      # Pause for 0.1 sec, because the Jenkins test runs so fast that the file
    52      # writes happen at the same timestamp.
    53      time.sleep(0.1)
    54  
    55      labels = [prefix, cache_label]
    56      self.cache_manager.write(values, *labels)
    57  
    58    def test_exists(self):
    59      """Test that CacheManager can correctly tell if the cache exists or not."""
    60      prefix = 'full'
    61      cache_label = 'some-cache-label'
    62      cache_version_one = ['cache', 'version', 'one']
    63  
    64      self.assertFalse(self.cache_manager.exists(prefix, cache_label))
    65      self.mock_write_cache(cache_version_one, prefix, cache_label)
    66      self.assertTrue(self.cache_manager.exists(prefix, cache_label))
    67      self.cache_manager.cleanup()
    68      self.assertFalse(self.cache_manager.exists(prefix, cache_label))
    69      self.mock_write_cache(cache_version_one, prefix, cache_label)
    70      self.assertTrue(self.cache_manager.exists(prefix, cache_label))
    71  
    72    def test_empty_label_not_exist(self):
    73      prefix = 'full'
    74      cache_label = 'some-cache-label'
    75      cache_version_one = ['cache', 'version', 'one']
    76  
    77      self.assertFalse(self.cache_manager.exists(prefix, cache_label))
    78      self.mock_write_cache(cache_version_one, prefix, cache_label)
    79      self.assertTrue(self.cache_manager.exists(prefix, cache_label))
    80  
    81      # '' shouldn't be treated as a wildcard to match everything.
    82      self.assertFalse(self.cache_manager.exists(prefix, ''))
    83  
    84    def test_size(self):
    85      """Test getting the size of some cache label."""
    86  
    87      # The Beam API for writing doesn't return the number of bytes that was
    88      # written to disk. So this test is only possible when the coder encodes the
    89      # bytes that will be written directly to disk, which only the WriteToText
    90      # transform does (with respect to the WriteToTFRecord transform).
    91      if self.cache_manager.cache_format != 'text':
    92        return
    93  
    94      prefix = 'full'
    95      cache_label = 'some-cache-label'
    96  
    97      # Test that if nothing is written the size is 0.
    98      self.assertEqual(self.cache_manager.size(prefix, cache_label), 0)
    99  
   100      value = 'a'
   101      self.mock_write_cache([value], prefix, cache_label)
   102      coder = self.cache_manager.load_pcoder(prefix, cache_label)
   103      encoded = coder.encode(value)
   104  
   105      # Add one to the size on disk because of the extra new-line character when
   106      # writing to file.
   107      self.assertEqual(
   108          self.cache_manager.size(prefix, cache_label), len(encoded) + 1)
   109  
   110    def test_clear(self):
   111      """Test that CacheManager can correctly tell if the cache exists or not."""
   112      prefix = 'full'
   113      cache_label = 'some-cache-label'
   114      cache_version_one = ['cache', 'version', 'one']
   115  
   116      self.assertFalse(self.cache_manager.exists(prefix, cache_label))
   117      self.mock_write_cache(cache_version_one, prefix, cache_label)
   118      self.assertTrue(self.cache_manager.exists(prefix, cache_label))
   119      self.assertTrue(self.cache_manager.clear(prefix, cache_label))
   120      self.assertFalse(self.cache_manager.exists(prefix, cache_label))
   121  
   122    def test_read_basic(self):
   123      """Test the condition where the cache is read once after written once."""
   124      prefix = 'full'
   125      cache_label = 'some-cache-label'
   126      cache_version_one = ['cache', 'version', 'one']
   127  
   128      self.mock_write_cache(cache_version_one, prefix, cache_label)
   129      reader, version = self.cache_manager.read(prefix, cache_label)
   130      pcoll_list = list(reader)
   131      self.assertListEqual(pcoll_list, cache_version_one)
   132      self.assertEqual(version, 0)
   133      self.assertTrue(
   134          self.cache_manager.is_latest_version(version, prefix, cache_label))
   135  
   136    def test_read_version_update(self):
   137      """Tests if the version is properly updated after the files are updated."""
   138      prefix = 'full'
   139      cache_label = 'some-cache-label'
   140      cache_version_one = ['cache', 'version', 'one']
   141      cache_version_two = ['cache', 'version', 'two']
   142  
   143      self.mock_write_cache(cache_version_one, prefix, cache_label)
   144      reader, version = self.cache_manager.read(prefix, cache_label)
   145      pcoll_list = list(reader)
   146  
   147      self.mock_write_cache(cache_version_two, prefix, cache_label)
   148      self.assertFalse(
   149          self.cache_manager.is_latest_version(version, prefix, cache_label))
   150  
   151      reader, version = self.cache_manager.read(prefix, cache_label)
   152      pcoll_list = list(reader)
   153      self.assertListEqual(pcoll_list, cache_version_two)
   154      self.assertEqual(version, 1)
   155      self.assertTrue(
   156          self.cache_manager.is_latest_version(version, prefix, cache_label))
   157  
   158    def test_read_before_write(self):
   159      """Test the behavior when read() is called before WriteCache completes."""
   160      prefix = 'full'
   161      cache_label = 'some-cache-label'
   162  
   163      self.assertFalse(self.cache_manager.exists(prefix, cache_label))
   164  
   165      reader, version = self.cache_manager.read(prefix, cache_label)
   166      pcoll_list = list(reader)
   167      self.assertListEqual(pcoll_list, [])
   168      self.assertEqual(version, -1)
   169      self.assertTrue(
   170          self.cache_manager.is_latest_version(version, prefix, cache_label))
   171  
   172    def test_read_over_cleanup(self):
   173      """Test the behavior of read() over cache cleanup."""
   174      prefix = 'full'
   175      cache_label = 'some-cache-label'
   176      cache_version_one = ['cache', 'version', 'one']
   177      cache_version_two = ['cache', 'version', 'two']
   178  
   179      # The initial write and read.
   180      self.mock_write_cache(cache_version_one, prefix, cache_label)
   181      reader, version = self.cache_manager.read(prefix, cache_label)
   182      pcoll_list = list(reader)
   183  
   184      # Cache cleanup.
   185      self.cache_manager.cleanup()
   186      # Check that even if cache is evicted, the latest version stays the same.
   187      self.assertTrue(
   188          self.cache_manager.is_latest_version(version, prefix, cache_label))
   189  
   190      reader, version = self.cache_manager.read(prefix, cache_label)
   191      pcoll_list = list(reader)
   192      self.assertListEqual(pcoll_list, [])
   193      self.assertEqual(version, -1)
   194      self.assertFalse(
   195          self.cache_manager.is_latest_version(version, prefix, cache_label))
   196  
   197      # PCollection brought back to cache.
   198      self.mock_write_cache(cache_version_two, prefix, cache_label)
   199      self.assertFalse(
   200          self.cache_manager.is_latest_version(version, prefix, cache_label))
   201  
   202      reader, version = self.cache_manager.read(prefix, cache_label)
   203      pcoll_list = list(reader)
   204      self.assertListEqual(pcoll_list, cache_version_two)
   205      # Check that version continues from the previous value instead of starting
   206      # from 0 again.
   207      self.assertEqual(version, 1)
   208      self.assertTrue(
   209          self.cache_manager.is_latest_version(version, prefix, cache_label))
   210  
   211    def test_load_saved_pcoder(self):
   212      pipeline = beam.Pipeline()
   213      pcoll = pipeline | beam.Create([1, 2, 3])
   214      _ = pcoll | cache.WriteCache(self.cache_manager, 'a key')
   215      self.assertIs(
   216          type(self.cache_manager.load_pcoder('full', 'a key')),
   217          type(coders.registry.get_coder(int)))
   218  
   219    def test_cache_manager_uses_gcs_ib_cache_root(self):
   220      """
   221      Checks that FileBasedCacheManager._cache_dir is set to the
   222      cache_root set under Interactive Beam for a GCS directory.
   223      """
   224      # Set Interactive Beam specified cache dir to cloud storage
   225      ib.options.cache_root = 'gs://'
   226  
   227      cache_manager_with_ib_option = cache.FileBasedCacheManager(
   228          cache_dir=ib.options.cache_root)
   229  
   230      self.assertEqual(
   231          ib.options.cache_root, cache_manager_with_ib_option._cache_dir)
   232  
   233      # Reset Interactive Beam setting
   234      ib.options.cache_root = None
   235  
   236    def test_cache_manager_uses_local_ib_cache_root(self):
   237      """
   238      Checks that FileBasedCacheManager._cache_dir is set to the
   239      cache_root set under Interactive Beam for a local directory
   240      and that the cached values are the same as the values of a
   241      cache using default settings.
   242      """
   243      prefix = 'full'
   244      cache_label = 'some-cache-label'
   245      cached_values = [1, 2, 3]
   246  
   247      self.mock_write_cache(cached_values, prefix, cache_label)
   248      reader_one, _ = self.cache_manager.read(prefix, cache_label)
   249      pcoll_list_one = list(reader_one)
   250  
   251      # Set Interactive Beam specified cache dir to local directory
   252      ib.options.cache_root = '/tmp/it-test/'
   253      cache_manager_with_ib_option = cache.FileBasedCacheManager(
   254          cache_dir=ib.options.cache_root)
   255      self.assertEqual(
   256          ib.options.cache_root, cache_manager_with_ib_option._cache_dir)
   257  
   258      cache_manager_with_ib_option.write(cached_values, *[prefix, cache_label])
   259      reader_two, _ = self.cache_manager.read(prefix, cache_label)
   260      pcoll_list_two = list(reader_two)
   261  
   262      # Writing to a different directory should not impact the cached values
   263      self.assertEqual(pcoll_list_one, pcoll_list_two)
   264  
   265      # Reset Interactive Beam setting
   266      ib.options.cache_root = None
   267  
   268  
   269  class TextFileBasedCacheManagerTest(
   270      FileBasedCacheManagerTest,
   271      unittest.TestCase,
   272  ):
   273  
   274    cache_format = 'text'
   275  
   276  
   277  class TFRecordBasedCacheManagerTest(
   278      FileBasedCacheManagerTest,
   279      unittest.TestCase,
   280  ):
   281  
   282    cache_format = 'tfrecord'
   283  
   284  
# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
  unittest.main()