github.com/google/osv-scalibr@v0.4.1/extractor/filesystem/embeddedfs/archive/archive.go (about)

     1  // Copyright 2025 Google LLC
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package archive provides an extractor for extracting software inventories from archives
    16  package archive
    17  
    18  import (
    19  	"compress/gzip"
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"strings"
    24  	"sync"
    25  
    26  	cpb "github.com/google/osv-scalibr/binary/proto/config_go_proto"
    27  	"github.com/google/osv-scalibr/extractor/filesystem"
    28  	"github.com/google/osv-scalibr/extractor/filesystem/embeddedfs/common"
    29  	scalibrfs "github.com/google/osv-scalibr/fs"
    30  	"github.com/google/osv-scalibr/inventory"
    31  	"github.com/google/osv-scalibr/plugin"
    32  )
    33  
    34  const (
    35  	// Name is the unique identifier for the archive extractor.
    36  	Name = "embeddedfs/archive"
    37  )
    38  
    39  // Extractor implements the filesystem.Extractor interface for archive extraction.
    40  type Extractor struct {
    41  	// maxFileSizeBytes is the maximum size of an archive file that can be traversed.
    42  	// If this limit is greater than zero and a file is encountered that is larger
    43  	// than this limit, the file is ignored.
    44  	maxFileSizeBytes int64
    45  }
    46  
    47  // New returns a new archive extractor.
    48  func New(cfg *cpb.PluginConfig) filesystem.Extractor {
    49  	maxSize := cfg.MaxFileSizeBytes
    50  	specific := plugin.FindConfig(cfg, func(c *cpb.PluginSpecificConfig) *cpb.ArchiveConfig { return c.GetArchive() })
    51  	if specific.GetMaxFileSizeBytes() > 0 {
    52  		maxSize = specific.GetMaxFileSizeBytes()
    53  	}
    54  	return &Extractor{maxFileSizeBytes: maxSize}
    55  }
    56  
    57  // Name returns the name of the extractor.
    58  func (e *Extractor) Name() string {
    59  	return Name
    60  }
    61  
    62  // Version returns the version of the extractor.
    63  func (e *Extractor) Version() int {
    64  	return 0
    65  }
    66  
    67  // Requirements returns the requirements for the extractor.
    68  func (e *Extractor) Requirements() *plugin.Capabilities {
    69  	return &plugin.Capabilities{}
    70  }
    71  
    72  // FileRequired checks if the file is a supported archive.
    73  func (e *Extractor) FileRequired(api filesystem.FileAPI) bool {
    74  	path := api.Path()
    75  	if !strings.HasSuffix(path, ".tar") && !strings.HasSuffix(path, ".tar.gz") {
    76  		return false
    77  	}
    78  
    79  	fileinfo, err := api.Stat()
    80  	if err != nil {
    81  		return false
    82  	}
    83  
    84  	if e.maxFileSizeBytes > 0 && fileinfo.Size() > e.maxFileSizeBytes {
    85  		return false
    86  	}
    87  
    88  	return true
    89  }
    90  
    91  // Extract returns an Inventory with embedded filesystems for the given archive file.
    92  func (e *Extractor) Extract(ctx context.Context, input *filesystem.ScanInput) (inventory.Inventory, error) {
    93  	if input.Reader == nil {
    94  		return inventory.Inventory{}, errors.New("input.Reader is nil")
    95  	}
    96  
    97  	var tempDir string
    98  	var err error
    99  	if strings.HasSuffix(input.Path, ".tar") {
   100  		tempDir, err = common.TARToTempDir(input.Reader)
   101  		if err != nil {
   102  			return inventory.Inventory{}, fmt.Errorf("common.TARToTempDir(%q): %w", input.Path, err)
   103  		}
   104  	} else if strings.HasSuffix(input.Path, ".tar.gz") {
   105  		reader, err := gzip.NewReader(input.Reader)
   106  		if err != nil {
   107  			return inventory.Inventory{}, fmt.Errorf("gzip.NewReader(%q): %w", input.Path, err)
   108  		}
   109  		tempDir, err = common.TARToTempDir(reader)
   110  		if err != nil {
   111  			return inventory.Inventory{}, fmt.Errorf("common.TARToTempDir(%q): %w", input.Path, err)
   112  		}
   113  	} else {
   114  		return inventory.Inventory{}, fmt.Errorf("%q not a supported archive format", input.Path)
   115  	}
   116  
   117  	var refCount int32 = 1
   118  	var refMu sync.Mutex
   119  	getEmbeddedFS := func(ctx context.Context) (scalibrfs.FS, error) {
   120  		return &common.EmbeddedDirFS{
   121  			FS:       scalibrfs.DirFS(tempDir),
   122  			File:     nil,
   123  			TmpPaths: []string{tempDir},
   124  			RefCount: &refCount,
   125  			RefMu:    &refMu,
   126  		}, nil
   127  	}
   128  	return inventory.Inventory{
   129  		EmbeddedFSs: []*inventory.EmbeddedFS{
   130  			{
   131  				Path:          input.Path,
   132  				GetEmbeddedFS: getEmbeddedFS,
   133  			}},
   134  	}, nil
   135  }