kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/pipeline/beamio/entries.go (about) 1 /* 2 * Copyright 2018 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package beamio 18 19 import ( 20 "context" 21 "fmt" 22 "io" 23 "strings" 24 25 "kythe.io/kythe/go/storage/stream" 26 "kythe.io/kythe/go/util/riegeli" 27 28 "github.com/apache/beam/sdks/go/pkg/beam" 29 "github.com/apache/beam/sdks/go/pkg/beam/io/filesystem" 30 31 spb "kythe.io/kythe/proto/storage_go_proto" 32 ) 33 34 func init() { 35 beam.RegisterFunction(readFile) 36 } 37 38 // ReadEntries reads a set of *spb.Entry messages into a PCollection from the 39 // given file, or files stored in a directory. The file can be part of any 40 // filesystem registered with the beam/io/filesystem package and can either be a 41 // delimited protobuf stream or a Riegeli file. 42 func ReadEntries(ctx context.Context, s beam.Scope, fileOrDir string) (beam.PCollection, error) { 43 if strings.HasSuffix(fileOrDir, "/") { 44 var errv beam.PCollection 45 fs, err := filesystem.New(ctx, fileOrDir) 46 if err != nil { 47 return errv, err 48 } 49 defer fs.Close() 50 files, err := fs.List(ctx, fileOrDir+"*") 51 if err != nil { 52 return errv, err 53 } 54 if len(files) == 0 { 55 return errv, fmt.Errorf("no entries found in %s - maybe mistyped path?", fileOrDir) 56 } 57 return beam.ParDo(s, readFile, beam.CreateList(s, files)), nil 58 } 59 return beam.ParDo(s, readFile, beam.Create(s, fileOrDir)), nil 60 } 61 62 func readFile(ctx context.Context, file string, emit func(*spb.Entry)) error { 63 if isRiegeli(ctx, file) { 64 return readRiegeli(ctx, file, emit) 65 } 66 return readStream(ctx, file, emit) 67 } 68 69 func isRiegeli(ctx context.Context, file string) bool { 70 fs, err := filesystem.New(ctx, file) 71 if err != nil { 72 return false 73 } 74 defer fs.Close() 75 f, err := fs.OpenRead(ctx, file) 76 if err != nil { 77 return false 78 } 79 defer f.Close() 80 rd := riegeli.NewReader(f) 81 if _, err := rd.RecordsMetadata(); err != nil { 82 return false 83 } 84 return true 85 } 86 87 func readStream(ctx context.Context, filename string, emit func(*spb.Entry)) error { 88 fs, err := filesystem.New(ctx, filename) 89 if err != nil { 90 return err 91 } 92 f, err := fs.OpenRead(ctx, filename) 93 if err != nil { 94 return err 95 } 96 defer f.Close() 97 for e := range stream.ReadEntries(f) { 98 emit(e) 99 } 100 return fs.Close() 101 } 102 103 func readRiegeli(ctx context.Context, filename string, emit func(*spb.Entry)) error { 104 fs, err := filesystem.New(ctx, filename) 105 if err != nil { 106 return err 107 } 108 defer fs.Close() 109 f, err := fs.OpenRead(ctx, filename) 110 if err != nil { 111 return err 112 } 113 defer f.Close() 114 115 rd := riegeli.NewReader(f) 116 for { 117 var e spb.Entry 118 if err := rd.NextProto(&e); err == io.EOF { 119 return nil 120 } else if err != nil { 121 return err 122 } 123 emit(&e) 124 } 125 }