github.com/apache/beam/sdks/v2@v2.48.2/go/test/integration/io/xlang/bigquery/bigquery_test.go

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bigquery

import (
	"flag"
	"fmt"
	"log"
	"math/rand"
	"reflect"
	"strings"
	"testing"
	"time"

	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/xlang/bigqueryio"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/dataflow"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/ptest"
	"github.com/apache/beam/sdks/v2/go/test/integration"
)

func init() {
	register.DoFn2x0[[]byte, func(TestRow)](&CreateTestRowsFn{})
	register.Emitter1[TestRow]()
	// TODO(https://github.com/apache/beam/issues/21789): Uncomment once this register no longer panics.
	//register.Function1x1(castFn)
}

var expansionAddr string // Populate with expansion address labelled "gcpio".

func checkFlags(t *testing.T) {
	if *integration.BigQueryDataset == "" {
		t.Skip("No BigQuery dataset provided.")
	}
}

const (
	// A text to shuffle to get random words.
	text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas eget nulla nec " +
		"velit hendrerit placerat. Donec eu odio ultricies, fermentum arcu at, mollis lectus. " +
		"Vestibulum porttitor pharetra sem vitae feugiat. Mauris facilisis neque in mauris " +
		"feugiat rhoncus. Donec eu ipsum at nibh lobortis euismod. Nam at hendrerit felis. " +
		"Vivamus et orci ex. Nam dui nisl, rutrum ac pretium eget, vehicula in tortor. Class " +
		"aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. " +
		"Phasellus ante lorem, pharetra blandit dapibus et, tempus nec purus. Maecenas in " +
		"posuere sem, vel pharetra nisl. Pellentesque habitant morbi tristique senectus et netus " +
		"et malesuada fames ac turpis egestas. Donec nec facilisis ex. Praesent euismod commodo " +
		"efficitur. Fusce in nisi nunc."
	// Number of random elements to create for the test. Must be less than the number of words in text.
	inputSize = 50
)

// TestRow is a sample row to write and read from that is expected to contain enough deterministic
// and random data in different data types to provide a reasonable signal that reading and writing
// works at a basic level.
type TestRow struct {
	Counter  int64    `beam:"counter"`   // A deterministic counter, increments for each row generated.
	RandData RandData `beam:"rand_data"` // An inner struct containing randomized data.
}

func shuffleText() []string {
	words := strings.Fields(text)
	rand.Shuffle(len(words), func(i, j int) { words[i], words[j] = words[j], words[i] })
	return words
}

// RandData is a struct of various types of random data.
type RandData struct {
	Flip bool   `beam:"flip"` // Flip is a bool with a random chance of either result (a coin flip).
	Num  int64  `beam:"num"`  // Num is a random int64.
	Word string `beam:"word"` // Word is a randomly selected word from a sample text.
}

// ddlTestRowSchema is a BigQuery data definition language (DDL) string that corresponds to TestRow.
const ddlTestRowSchema = "counter INT64 NOT NULL, " +
	"rand_data STRUCT<" +
	"flip BOOL NOT NULL," +
	"num INT64 NOT NULL," +
	"word STRING NOT NULL" +
	"> NOT NULL"

// CreateTestRowsFn is a DoFn that creates randomized TestRows based on a seed.
type CreateTestRowsFn struct {
	seed int64
}

// ProcessElement creates a number of TestRows, populating the randomized data.
func (fn *CreateTestRowsFn) ProcessElement(_ []byte, emit func(TestRow)) {
	rand.Seed(fn.seed)
	words := shuffleText()
	for i := 0; i < inputSize; i++ {
		emit(TestRow{
			Counter: int64(i),
			RandData: RandData{
				Flip: rand.Int63n(2) != 0,
				Num:  rand.Int63(),
				Word: words[i],
			},
		})
	}
}

// WritePipeline creates a pipeline that writes elements created by createFn into a BigQuery table.
func WritePipeline(expansionAddr, table string, createFn any) *beam.Pipeline {
	p := beam.NewPipeline()
	s := p.Root()

	// Generate elements and write to table. CreateNever means the destination table must
	// already exist; the tests create it before running the pipeline.
	rows := beam.ParDo(s, createFn, beam.Impulse(s))
	bigqueryio.Write(s, table, rows,
		bigqueryio.CreateDisposition(bigqueryio.CreateNever),
		bigqueryio.WriteExpansionAddr(expansionAddr))

	return p
}

// ReadPipeline creates a pipeline that reads elements directly from a BigQuery table and asserts
// that they match elements created by createFn.
func ReadPipeline(expansionAddr, table string, createFn any) *beam.Pipeline {
	p := beam.NewPipeline()
	s := p.Root()

	// Read from table and compare to generated elements.
	rows := beam.ParDo(s, createFn, beam.Impulse(s))
	inType := reflect.TypeOf((*TestRow)(nil)).Elem()
	readRows := bigqueryio.Read(s, inType,
		bigqueryio.FromTable(table),
		bigqueryio.ReadExpansionAddr(expansionAddr))
	passert.Equals(s, readRows, rows)

	return p
}

// TestRowPtrs is equivalent to TestRow but all fields are pointers, meant to be used when reading
// via query.
//
// TODO(https://github.com/apache/beam/issues/21784): Change back to a named struct once resolved.
type TestRowPtrs = struct {
	Counter  *int64        `beam:"counter"`
	RandData *RandDataPtrs `beam:"rand_data"`
}

// RandDataPtrs is equivalent to RandData but all fields are pointers, meant to be used when reading
// via query.
//
// TODO(https://github.com/apache/beam/issues/21784): Change back to a named struct once resolved.
type RandDataPtrs = struct {
	Flip *bool   `beam:"flip"`
	Num  *int64  `beam:"num"`
	Word *string `beam:"word"`
}

// castFn converts the result of the query, which has pointer fields, into the original TestRow
// type that was written to BigQuery.
func castFn(elm TestRowPtrs) TestRow {
	return TestRow{
		Counter: *elm.Counter,
		RandData: RandData{
			Flip: *elm.RandData.Flip,
			Num:  *elm.RandData.Num,
			Word: *elm.RandData.Word,
		},
	}
}

// ReadFromQueryPipeline creates a pipeline that reads elements from a BigQuery table via a SQL
// query, and asserts that they match elements created by createFn.
func ReadFromQueryPipeline(expansionAddr, table string, createFn any) *beam.Pipeline {
	p := beam.NewPipeline()
	s := p.Root()

	// Read from table and compare to generated elements.
	rows := beam.ParDo(s, createFn, beam.Impulse(s))
	inType := reflect.TypeOf((*TestRowPtrs)(nil)).Elem()
	query := fmt.Sprintf("SELECT * FROM `%s`", table)
	readRows := bigqueryio.Read(s, inType,
		bigqueryio.FromQuery(query),
		bigqueryio.ReadExpansionAddr(expansionAddr))
	castRows := beam.ParDo(s, castFn, readRows)
	passert.Equals(s, castRows, rows)

	return p
}

// TestBigQueryIO_BasicWriteRead runs a pipeline that generates semi-randomized elements, writes
// them to a BigQuery table, then reads them back from that table and checks that the result
// matches the original inputs. The table must be created before the pipelines run.
func TestBigQueryIO_BasicWriteRead(t *testing.T) {
	integration.CheckFilters(t)
	checkFlags(t)

	// Create a table before running the pipeline.
	table, err := newTempTable(*integration.BigQueryDataset, "go_bqio_it", ddlTestRowSchema)
	if err != nil {
		t.Fatalf("error creating BigQuery table: %v", err)
	}
	t.Logf("Created BigQuery table %v", table)

	createTestRows := &CreateTestRowsFn{seed: time.Now().UnixNano()}
	write := WritePipeline(expansionAddr, table, createTestRows)
	ptest.RunAndValidate(t, write)
	read := ReadPipeline(expansionAddr, table, createTestRows)
	ptest.RunAndValidate(t, read)

	t.Logf("Deleting BigQuery table %v", table)
	err = deleteTempTable(table)
	if err != nil {
		t.Logf("Error deleting BigQuery table: %v", err)
	}
}

// TestBigQueryIO_BasicWriteQueryRead runs a pipeline that generates semi-randomized elements,
// writes them to a BigQuery table, then reads them back and checks that the result matches the
// original inputs. The table must be created before the pipelines run.
//
// This test reads via a BigQuery SQL query, instead of directly from a table.
func TestBigQueryIO_BasicWriteQueryRead(t *testing.T) {
	integration.CheckFilters(t)
	checkFlags(t)

	// Create a table before running the pipeline.
	table, err := newTempTable(*integration.BigQueryDataset, "go_bqio_it", ddlTestRowSchema)
	if err != nil {
		t.Fatalf("error creating BigQuery table: %v", err)
	}
	t.Logf("Created BigQuery table %v", table)

	createTestRows := &CreateTestRowsFn{seed: time.Now().UnixNano()}
	write := WritePipeline(expansionAddr, table, createTestRows)
	ptest.RunAndValidate(t, write)
	readQuery := ReadFromQueryPipeline(expansionAddr, table, createTestRows)
	ptest.RunAndValidate(t, readQuery)

	t.Logf("Deleting BigQuery table %v", table)
	err = deleteTempTable(table)
	if err != nil {
		t.Logf("Error deleting BigQuery table: %v", err)
	}
}

func TestMain(m *testing.M) {
	flag.Parse()
	beam.Init()

	services := integration.NewExpansionServices()
	defer func() { services.Shutdown() }()
	addr, err := services.GetAddr("gcpio")
	if err != nil {
		log.Printf("skipping missing expansion service: %v", err)
	} else {
		expansionAddr = addr
	}

	ptest.MainRet(m)
}