#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file

import time
import unittest

import apache_beam as beam
from apache_beam import coders
from apache_beam.runners.interactive import cache_manager as cache
from apache_beam.runners.interactive import interactive_beam as ib


class FileBasedCacheManagerTest(object):
  """Unit test for FileBasedCacheManager.

  Note that this set of tests focuses only on the methods that interact with
  the LOCAL file system. The idea is that once FileBasedCacheManager works well
  with the local file system, it should work with any file system with
  `apache_beam.io.filesystem` interface. Those tests that involve interactions
  with Beam pipeline (i.e. source(), sink(), ReadCache, and WriteCache) will be
  tested with InteractiveRunner as a part of integration tests instead.

  This class is not itself a unittest.TestCase. The concrete test classes at
  the bottom of this module combine it with unittest.TestCase and set
  `cache_format` to the format under test.
  """

  # Cache format handed to FileBasedCacheManager in setUp(); overridden by the
  # concrete subclasses.
  cache_format = None  # type: str

  def setUp(self):
    """Create a fresh cache manager using the subclass's cache format."""
    self.cache_manager = cache.FileBasedCacheManager(
        cache_format=self.cache_format)

  def tearDown(self):
    """Remove whatever the cache manager under test wrote."""
    self.cache_manager.cleanup()

  def _set_ib_cache_root(self, cache_root):
    """Point Interactive Beam's cache_root at `cache_root` for this test.

    The value that was in place beforehand is restored through addCleanup, so
    the global option does not leak into other tests even when an assertion
    fails part-way through the test.
    """
    previous_cache_root = ib.options.cache_root
    self.addCleanup(setattr, ib.options, 'cache_root', previous_cache_root)
    ib.options.cache_root = cache_root

  def mock_write_cache(self, values, prefix, cache_label):
    """Cache the PCollection where cache.WriteCache would write to."""
    # Pause for 0.1 sec, because the Jenkins test runs so fast that the file
    # writes happen at the same timestamp.
    time.sleep(0.1)

    self.cache_manager.write(values, prefix, cache_label)

  def test_exists(self):
    """Test that CacheManager can correctly tell if the cache exists or not."""
    prefix = 'full'
    cache_label = 'some-cache-label'
    cache_version_one = ['cache', 'version', 'one']

    self.assertFalse(self.cache_manager.exists(prefix, cache_label))
    self.mock_write_cache(cache_version_one, prefix, cache_label)
    self.assertTrue(self.cache_manager.exists(prefix, cache_label))
    self.cache_manager.cleanup()
    self.assertFalse(self.cache_manager.exists(prefix, cache_label))
    self.mock_write_cache(cache_version_one, prefix, cache_label)
    self.assertTrue(self.cache_manager.exists(prefix, cache_label))

  def test_empty_label_not_exist(self):
    """Test that an empty label does not match an existing cache."""
    prefix = 'full'
    cache_label = 'some-cache-label'
    cache_version_one = ['cache', 'version', 'one']

    self.assertFalse(self.cache_manager.exists(prefix, cache_label))
    self.mock_write_cache(cache_version_one, prefix, cache_label)
    self.assertTrue(self.cache_manager.exists(prefix, cache_label))

    # '' shouldn't be treated as a wildcard to match everything.
    self.assertFalse(self.cache_manager.exists(prefix, ''))

  def test_size(self):
    """Test getting the size of some cache label."""

    # The Beam API for writing doesn't return the number of bytes that was
    # written to disk. So this test is only possible when the coder encodes the
    # bytes that will be written directly to disk, which only the WriteToText
    # transform does (with respect to the WriteToTFRecord transform).
    # Report the other formats as skipped instead of as a silent pass.
    if self.cache_manager.cache_format != 'text':
      self.skipTest('Size on disk can only be predicted for the text format.')

    prefix = 'full'
    cache_label = 'some-cache-label'

    # Test that if nothing is written the size is 0.
    self.assertEqual(self.cache_manager.size(prefix, cache_label), 0)

    value = 'a'
    self.mock_write_cache([value], prefix, cache_label)
    coder = self.cache_manager.load_pcoder(prefix, cache_label)
    encoded = coder.encode(value)

    # Add one to the size on disk because of the extra new-line character when
    # writing to file.
    self.assertEqual(
        self.cache_manager.size(prefix, cache_label), len(encoded) + 1)

  def test_clear(self):
    """Test that clear() reports success and removes the cached label."""
    prefix = 'full'
    cache_label = 'some-cache-label'
    cache_version_one = ['cache', 'version', 'one']

    self.assertFalse(self.cache_manager.exists(prefix, cache_label))
    self.mock_write_cache(cache_version_one, prefix, cache_label)
    self.assertTrue(self.cache_manager.exists(prefix, cache_label))
    self.assertTrue(self.cache_manager.clear(prefix, cache_label))
    self.assertFalse(self.cache_manager.exists(prefix, cache_label))

  def test_read_basic(self):
    """Test the condition where the cache is read once after written once."""
    prefix = 'full'
    cache_label = 'some-cache-label'
    cache_version_one = ['cache', 'version', 'one']

    self.mock_write_cache(cache_version_one, prefix, cache_label)
    reader, version = self.cache_manager.read(prefix, cache_label)
    pcoll_list = list(reader)
    self.assertListEqual(pcoll_list, cache_version_one)
    self.assertEqual(version, 0)
    self.assertTrue(
        self.cache_manager.is_latest_version(version, prefix, cache_label))

  def test_read_version_update(self):
    """Tests if the version is properly updated after the files are updated."""
    prefix = 'full'
    cache_label = 'some-cache-label'
    cache_version_one = ['cache', 'version', 'one']
    cache_version_two = ['cache', 'version', 'two']

    self.mock_write_cache(cache_version_one, prefix, cache_label)
    reader, version = self.cache_manager.read(prefix, cache_label)
    pcoll_list = list(reader)
    self.assertListEqual(pcoll_list, cache_version_one)

    self.mock_write_cache(cache_version_two, prefix, cache_label)
    self.assertFalse(
        self.cache_manager.is_latest_version(version, prefix, cache_label))

    reader, version = self.cache_manager.read(prefix, cache_label)
    pcoll_list = list(reader)
    self.assertListEqual(pcoll_list, cache_version_two)
    self.assertEqual(version, 1)
    self.assertTrue(
        self.cache_manager.is_latest_version(version, prefix, cache_label))

  def test_read_before_write(self):
    """Test the behavior when read() is called before WriteCache completes."""
    prefix = 'full'
    cache_label = 'some-cache-label'

    self.assertFalse(self.cache_manager.exists(prefix, cache_label))

    reader, version = self.cache_manager.read(prefix, cache_label)
    pcoll_list = list(reader)
    self.assertListEqual(pcoll_list, [])
    self.assertEqual(version, -1)
    self.assertTrue(
        self.cache_manager.is_latest_version(version, prefix, cache_label))

  def test_read_over_cleanup(self):
    """Test the behavior of read() over cache cleanup."""
    prefix = 'full'
    cache_label = 'some-cache-label'
    cache_version_one = ['cache', 'version', 'one']
    cache_version_two = ['cache', 'version', 'two']

    # The initial write and read.
    self.mock_write_cache(cache_version_one, prefix, cache_label)
    reader, version = self.cache_manager.read(prefix, cache_label)
    pcoll_list = list(reader)
    self.assertListEqual(pcoll_list, cache_version_one)

    # Cache cleanup.
    self.cache_manager.cleanup()
    # Check that even if cache is evicted, the latest version stays the same.
    self.assertTrue(
        self.cache_manager.is_latest_version(version, prefix, cache_label))

    reader, version = self.cache_manager.read(prefix, cache_label)
    pcoll_list = list(reader)
    self.assertListEqual(pcoll_list, [])
    self.assertEqual(version, -1)
    self.assertFalse(
        self.cache_manager.is_latest_version(version, prefix, cache_label))

    # PCollection brought back to cache.
    self.mock_write_cache(cache_version_two, prefix, cache_label)
    self.assertFalse(
        self.cache_manager.is_latest_version(version, prefix, cache_label))

    reader, version = self.cache_manager.read(prefix, cache_label)
    pcoll_list = list(reader)
    self.assertListEqual(pcoll_list, cache_version_two)
    # Check that version continues from the previous value instead of starting
    # from 0 again.
    self.assertEqual(version, 1)
    self.assertTrue(
        self.cache_manager.is_latest_version(version, prefix, cache_label))

  def test_load_saved_pcoder(self):
    """Test that applying WriteCache makes the element coder loadable."""
    pipeline = beam.Pipeline()
    pcoll = pipeline | beam.Create([1, 2, 3])
    _ = pcoll | cache.WriteCache(self.cache_manager, 'a key')
    self.assertIs(
        type(self.cache_manager.load_pcoder('full', 'a key')),
        type(coders.registry.get_coder(int)))

  def test_cache_manager_uses_gcs_ib_cache_root(self):
    """
    Checks that FileBasedCacheManager._cache_dir is set to the
    cache_root set under Interactive Beam for a GCS directory.
    """
    # Set Interactive Beam specified cache dir to cloud storage. The helper
    # restores the previous setting once the test finishes.
    self._set_ib_cache_root('gs://')

    # Only constructed, never written to or cleaned up, so this test does not
    # perform any cache I/O against the 'gs://' location itself.
    cache_manager_with_ib_option = cache.FileBasedCacheManager(
        cache_dir=ib.options.cache_root)

    self.assertEqual(
        ib.options.cache_root, cache_manager_with_ib_option._cache_dir)

  def test_cache_manager_uses_local_ib_cache_root(self):
    """
    Checks that FileBasedCacheManager._cache_dir is set to the
    cache_root set under Interactive Beam for a local directory
    and that the cached values are the same as the values of a
    cache using default settings.
    """
    prefix = 'full'
    cache_label = 'some-cache-label'
    cached_values = [1, 2, 3]

    self.mock_write_cache(cached_values, prefix, cache_label)
    reader_one, _ = self.cache_manager.read(prefix, cache_label)
    pcoll_list_one = list(reader_one)

    # Set Interactive Beam specified cache dir to local directory. The helper
    # restores the previous setting once the test finishes.
    self._set_ib_cache_root('/tmp/it-test/')
    cache_manager_with_ib_option = cache.FileBasedCacheManager(
        cache_dir=ib.options.cache_root)
    # Clean up what this second manager writes, mirroring what tearDown() does
    # for the manager under test.
    self.addCleanup(cache_manager_with_ib_option.cleanup)
    self.assertEqual(
        ib.options.cache_root, cache_manager_with_ib_option._cache_dir)

    cache_manager_with_ib_option.write(cached_values, prefix, cache_label)
    reader_two, _ = self.cache_manager.read(prefix, cache_label)
    pcoll_list_two = list(reader_two)

    # Writing to a different directory should not impact the cached values
    self.assertEqual(pcoll_list_one, pcoll_list_two)


class TextFileBasedCacheManagerTest(
    FileBasedCacheManagerTest,
    unittest.TestCase,
):
  """Runs the FileBasedCacheManagerTest suite with the 'text' cache format."""

  cache_format = 'text'


class TFRecordBasedCacheManagerTest(
    FileBasedCacheManagerTest,
    unittest.TestCase,
):
  """Runs the FileBasedCacheManagerTest suite with the 'tfrecord' format."""

  cache_format = 'tfrecord'


if __name__ == '__main__':
  unittest.main()