github.com/apache/beam/sdks/v2@v2.48.2/go/test/integration/io/xlang/bigquery/bigquery_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  package bigquery
    17  
    18  import (
    19  	"flag"
    20  	"fmt"
    21  	"log"
    22  	"math/rand"
    23  	"reflect"
    24  	"strings"
    25  	"testing"
    26  	"time"
    27  
    28  	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    29  
    30  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    31  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/xlang/bigqueryio"
    32  	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/dataflow"
    33  	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert"
    34  	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/ptest"
    35  	"github.com/apache/beam/sdks/v2/go/test/integration"
    36  )
    37  
    38  func init() {
    39  	register.DoFn2x0[[]byte, func(TestRow)](&CreateTestRowsFn{})
    40  	register.Emitter1[TestRow]()
    41  	// TODO(https://github.com/apache/beam/issues/21789): Uncomment once this register no longer panics.
    42  	//register.Function1x1(castFn)
    43  }
    44  
    45  var expansionAddr string // Populate with expansion address labelled "gcpio".
    46  
    47  func checkFlags(t *testing.T) {
    48  	if *integration.BigQueryDataset == "" {
    49  		t.Skip("No BigQuery dataset provided.")
    50  	}
    51  }
    52  
    53  const (
    54  	// A text to shuffle to get random words.
    55  	text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas eget nulla nec " +
    56  		"velit hendrerit placerat. Donec eu odio ultricies, fermentum arcu at, mollis lectus. " +
    57  		"Vestibulum porttitor pharetra sem vitae feugiat. Mauris facilisis neque in mauris " +
    58  		"feugiat rhoncus. Donec eu ipsum at nibh lobortis euismod. Nam at hendrerit felis. " +
    59  		"Vivamus et orci ex. Nam dui nisl, rutrum ac pretium eget, vehicula in tortor. Class " +
    60  		"aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. " +
    61  		"Phasellus ante lorem, pharetra blandit dapibus et, tempus nec purus. Maecenas in " +
    62  		"posuere sem, vel pharetra nisl. Pellentesque habitant morbi tristique senectus et netus " +
    63  		"et malesuada fames ac turpis egestas. Donec nec facilisis ex. Praesent euismod commodo " +
    64  		"efficitur. Fusce in nisi nunc."
    65  	// Number of random elements to create for test. Must be less than number of words in text.
    66  	inputSize = 50
    67  )
    68  
    69  // TestRow is a sample row to write and read from that is expected to contain enough deterministic
    70  // and random data in different data types to provide a reasonable signal that reading and writing
    71  // works at a basic level.
    72  type TestRow struct {
    73  	Counter  int64    `beam:"counter"`   // A deterministic counter, increments for each row generated.
    74  	RandData RandData `beam:"rand_data"` // An inner struct containing randomized data.
    75  }
    76  
    77  func shuffleText() []string {
    78  	words := strings.Fields(text)
    79  	rand.Shuffle(len(words), func(i, j int) { words[i], words[j] = words[j], words[i] })
    80  	return words
    81  }
    82  
    83  // RandData is a struct of various types of random data.
    84  type RandData struct {
    85  	Flip bool   `beam:"flip"` // Flip is a bool with a random chance of either result (a coin flip).
    86  	Num  int64  `beam:"num"`  // Num is a random int64.
    87  	Word string `beam:"word"` // Word is a randomly selected word from a sample text.
    88  }
    89  
    90  // ddlSchema is a string for BigQuery data definition language that corresponds to TestRow.
    91  const ddlTestRowSchema = "counter INT64 NOT NULL, " +
    92  	"rand_data STRUCT<" +
    93  	"flip BOOL NOT NULL," +
    94  	"num INT64 NOT NULL," +
    95  	"word STRING NOT NULL" +
    96  	"> NOT NULL"
    97  
    98  // CreateTestRowsFn is a DoFn that creates randomized TestRows based on a seed.
    99  type CreateTestRowsFn struct {
   100  	seed int64
   101  }
   102  
   103  // ProcessElement creates a number of TestRows, populating the randomized data.
   104  func (fn *CreateTestRowsFn) ProcessElement(_ []byte, emit func(TestRow)) {
   105  	rand.Seed(fn.seed)
   106  	words := shuffleText()
   107  	for i := 0; i < inputSize; i++ {
   108  		emit(TestRow{
   109  			Counter: int64(i),
   110  			RandData: RandData{
   111  				Flip: rand.Int63n(2) != 0,
   112  				Num:  rand.Int63(),
   113  				Word: words[i],
   114  			},
   115  		})
   116  	}
   117  }
   118  
   119  // WritePipeline creates a pipeline that writes elements created by createFn into a BigQuery table.
   120  func WritePipeline(expansionAddr, table string, createFn any) *beam.Pipeline {
   121  	p := beam.NewPipeline()
   122  	s := p.Root()
   123  
   124  	// Generate elements and write to table.
   125  	rows := beam.ParDo(s, createFn, beam.Impulse(s))
   126  	bigqueryio.Write(s, table, rows,
   127  		bigqueryio.CreateDisposition(bigqueryio.CreateNever),
   128  		bigqueryio.WriteExpansionAddr(expansionAddr))
   129  
   130  	return p
   131  }
   132  
   133  // ReadPipeline creates a pipeline that reads elements directly from a BigQuery table and asserts
   134  // that they match elements created by createFn.
   135  func ReadPipeline(expansionAddr, table string, createFn any) *beam.Pipeline {
   136  	p := beam.NewPipeline()
   137  	s := p.Root()
   138  
   139  	// Read from table and compare to generated elements.
   140  	rows := beam.ParDo(s, createFn, beam.Impulse(s))
   141  	inType := reflect.TypeOf((*TestRow)(nil)).Elem()
   142  	readRows := bigqueryio.Read(s, inType,
   143  		bigqueryio.FromTable(table),
   144  		bigqueryio.ReadExpansionAddr(expansionAddr))
   145  	passert.Equals(s, readRows, rows)
   146  
   147  	return p
   148  }
   149  
   150  // TestRowPtrs is equivalent to TestRow but all fields are pointers, meant to be used when reading
   151  // via query.
   152  //
   153  // TODO(https://github.com/apache/beam/issues/21784): Change back to a named struct once resolved.
   154  type TestRowPtrs = struct {
   155  	Counter  *int64        `beam:"counter"`
   156  	RandData *RandDataPtrs `beam:"rand_data"`
   157  }
   158  
   159  // RandDataPtrs is equivalent to RandData but all fields are pointers, meant to be used when reading
   160  // via query.
   161  //
   162  // TODO(https://github.com/apache/beam/issues/21784): Change back to a named struct once resolved.
   163  type RandDataPtrs = struct {
   164  	Flip *bool   `beam:"flip"`
   165  	Num  *int64  `beam:"num"`
   166  	Word *string `beam:"word"`
   167  }
   168  
   169  // castFn converts the result of the query which has pointer fields, into the original TestRow
   170  // type that was written to BigQuery.
   171  func castFn(elm TestRowPtrs) TestRow {
   172  	return TestRow{
   173  		Counter: *elm.Counter,
   174  		RandData: RandData{
   175  			Flip: *elm.RandData.Flip,
   176  			Num:  *elm.RandData.Num,
   177  			Word: *elm.RandData.Word,
   178  		},
   179  	}
   180  }
   181  
   182  // ReadPipeline creates a pipeline that reads elements from a BigQuery table via a SQL Query, and
   183  // asserts that they match elements created by createFn.
   184  func ReadFromQueryPipeline(expansionAddr, table string, createFn any) *beam.Pipeline {
   185  	p := beam.NewPipeline()
   186  	s := p.Root()
   187  
   188  	// Read from table and compare to generated elements.
   189  	rows := beam.ParDo(s, createFn, beam.Impulse(s))
   190  	inType := reflect.TypeOf((*TestRowPtrs)(nil)).Elem()
   191  	query := fmt.Sprintf("SELECT * FROM `%s`", table)
   192  	readRows := bigqueryio.Read(s, inType,
   193  		bigqueryio.FromQuery(query),
   194  		bigqueryio.ReadExpansionAddr(expansionAddr))
   195  	castRows := beam.ParDo(s, castFn, readRows)
   196  	passert.Equals(s, castRows, rows)
   197  
   198  	return p
   199  }
   200  
   201  // TestBigQueryIO_BasicWriteRead runs a pipeline that generates semi-randomized elements, writes
   202  // them to a BigQuery table and then reads from that table, and checks that the result matches the
   203  // original inputs. This requires a pre-existing table to be created.
   204  func TestBigQueryIO_BasicWriteRead(t *testing.T) {
   205  	integration.CheckFilters(t)
   206  	checkFlags(t)
   207  
   208  	// Create a table before running the pipeline
   209  	table, err := newTempTable(*integration.BigQueryDataset, "go_bqio_it", ddlTestRowSchema)
   210  	if err != nil {
   211  		t.Fatalf("error creating BigQuery table: %v", err)
   212  	}
   213  	t.Logf("Created BigQuery table %v", table)
   214  
   215  	createTestRows := &CreateTestRowsFn{seed: time.Now().UnixNano()}
   216  	write := WritePipeline(expansionAddr, table, createTestRows)
   217  	ptest.RunAndValidate(t, write)
   218  	read := ReadPipeline(expansionAddr, table, createTestRows)
   219  	ptest.RunAndValidate(t, read)
   220  
   221  	t.Logf("Deleting BigQuery table %v", table)
   222  	err = deleteTempTable(table)
   223  	if err != nil {
   224  		t.Logf("Error deleting BigQuery table: %v", err)
   225  	}
   226  }
   227  
   228  // TestBigQueryIO_BasicWriteQueryRead runs a pipeline that generates semi-randomized elements,
   229  // writes them to a BigQuery table and then reads from that table, and checks that the result
   230  // matches the original inputs. This requires a pre-existing table to be created.
   231  //
   232  // This test reads from a Bigquery SQL query, instead of directly from a table.
   233  func TestBigQueryIO_BasicWriteQueryRead(t *testing.T) {
   234  	integration.CheckFilters(t)
   235  	checkFlags(t)
   236  
   237  	// Create a table before running the pipeline
   238  	table, err := newTempTable(*integration.BigQueryDataset, "go_bqio_it", ddlTestRowSchema)
   239  	if err != nil {
   240  		t.Fatalf("error creating BigQuery table: %v", err)
   241  	}
   242  	t.Logf("Created BigQuery table %v", table)
   243  
   244  	createTestRows := &CreateTestRowsFn{seed: time.Now().UnixNano()}
   245  	write := WritePipeline(expansionAddr, table, createTestRows)
   246  	ptest.RunAndValidate(t, write)
   247  	readQuery := ReadFromQueryPipeline(expansionAddr, table, createTestRows)
   248  	ptest.RunAndValidate(t, readQuery)
   249  
   250  	t.Logf("Deleting BigQuery table %v", table)
   251  	err = deleteTempTable(table)
   252  	if err != nil {
   253  		t.Logf("Error deleting BigQuery table: %v", err)
   254  	}
   255  }
   256  
   257  func TestMain(m *testing.M) {
   258  	flag.Parse()
   259  	beam.Init()
   260  
   261  	services := integration.NewExpansionServices()
   262  	defer func() { services.Shutdown() }()
   263  	addr, err := services.GetAddr("gcpio")
   264  	if err != nil {
   265  		log.Printf("skipping missing expansion service: %v", err)
   266  	} else {
   267  		expansionAddr = addr
   268  	}
   269  
   270  	ptest.MainRet(m)
   271  }