kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/pipeline/beamio/entries.go (about)

     1  /*
     2   * Copyright 2018 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package beamio
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"io"
    23  	"strings"
    24  
    25  	"kythe.io/kythe/go/storage/stream"
    26  	"kythe.io/kythe/go/util/riegeli"
    27  
    28  	"github.com/apache/beam/sdks/go/pkg/beam"
    29  	"github.com/apache/beam/sdks/go/pkg/beam/io/filesystem"
    30  
    31  	spb "kythe.io/kythe/proto/storage_go_proto"
    32  )
    33  
    34  func init() {
    35  	beam.RegisterFunction(readFile)
    36  }
    37  
    38  // ReadEntries reads a set of *spb.Entry messages into a PCollection from the
    39  // given file, or files stored in a directory.  The file can be part of any
    40  // filesystem registered with the beam/io/filesystem package and can either be a
    41  // delimited protobuf stream or a Riegeli file.
    42  func ReadEntries(ctx context.Context, s beam.Scope, fileOrDir string) (beam.PCollection, error) {
    43  	if strings.HasSuffix(fileOrDir, "/") {
    44  		var errv beam.PCollection
    45  		fs, err := filesystem.New(ctx, fileOrDir)
    46  		if err != nil {
    47  			return errv, err
    48  		}
    49  		defer fs.Close()
    50  		files, err := fs.List(ctx, fileOrDir+"*")
    51  		if err != nil {
    52  			return errv, err
    53  		}
    54  		if len(files) == 0 {
    55  			return errv, fmt.Errorf("no entries found in %s - maybe mistyped path?", fileOrDir)
    56  		}
    57  		return beam.ParDo(s, readFile, beam.CreateList(s, files)), nil
    58  	}
    59  	return beam.ParDo(s, readFile, beam.Create(s, fileOrDir)), nil
    60  }
    61  
    62  func readFile(ctx context.Context, file string, emit func(*spb.Entry)) error {
    63  	if isRiegeli(ctx, file) {
    64  		return readRiegeli(ctx, file, emit)
    65  	}
    66  	return readStream(ctx, file, emit)
    67  }
    68  
    69  func isRiegeli(ctx context.Context, file string) bool {
    70  	fs, err := filesystem.New(ctx, file)
    71  	if err != nil {
    72  		return false
    73  	}
    74  	defer fs.Close()
    75  	f, err := fs.OpenRead(ctx, file)
    76  	if err != nil {
    77  		return false
    78  	}
    79  	defer f.Close()
    80  	rd := riegeli.NewReader(f)
    81  	if _, err := rd.RecordsMetadata(); err != nil {
    82  		return false
    83  	}
    84  	return true
    85  }
    86  
    87  func readStream(ctx context.Context, filename string, emit func(*spb.Entry)) error {
    88  	fs, err := filesystem.New(ctx, filename)
    89  	if err != nil {
    90  		return err
    91  	}
    92  	f, err := fs.OpenRead(ctx, filename)
    93  	if err != nil {
    94  		return err
    95  	}
    96  	defer f.Close()
    97  	for e := range stream.ReadEntries(f) {
    98  		emit(e)
    99  	}
   100  	return fs.Close()
   101  }
   102  
   103  func readRiegeli(ctx context.Context, filename string, emit func(*spb.Entry)) error {
   104  	fs, err := filesystem.New(ctx, filename)
   105  	if err != nil {
   106  		return err
   107  	}
   108  	defer fs.Close()
   109  	f, err := fs.OpenRead(ctx, filename)
   110  	if err != nil {
   111  		return err
   112  	}
   113  	defer f.Close()
   114  
   115  	rd := riegeli.NewReader(f)
   116  	for {
   117  		var e spb.Entry
   118  		if err := rd.NextProto(&e); err == io.EOF {
   119  			return nil
   120  		} else if err != nil {
   121  			return err
   122  		}
   123  		emit(&e)
   124  	}
   125  }