github.com/anchore/syft@v1.38.2/syft/pkg/cataloger/ai/parse_gguf_model.go (about)

     1  package ai
     2  
     3  import (
     4  	"context"
     5  	"encoding/json"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"path/filepath"
    10  	"sort"
    11  	"strings"
    12  
    13  	"github.com/cespare/xxhash/v2"
    14  	gguf_parser "github.com/gpustack/gguf-parser-go"
    15  
    16  	"github.com/anchore/syft/internal"
    17  	"github.com/anchore/syft/internal/log"
    18  	"github.com/anchore/syft/internal/unknown"
    19  	"github.com/anchore/syft/syft/artifact"
    20  	"github.com/anchore/syft/syft/file"
    21  	"github.com/anchore/syft/syft/pkg"
    22  	"github.com/anchore/syft/syft/pkg/cataloger/generic"
    23  )
    24  
    25  // parseGGUFModel parses a GGUF model file and returns the discovered package.
    26  // This implementation only reads the header portion of the file, not the entire model.
    27  func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
    28  	defer internal.CloseAndLogError(reader, reader.Path())
    29  
    30  	// Create a temporary file for the library to parse
    31  	// The library requires a file path, so we create a temp file
    32  	tempFile, err := os.CreateTemp("", "syft-gguf-*.gguf")
    33  	if err != nil {
    34  		return nil, nil, fmt.Errorf("failed to create temp file: %w", err)
    35  	}
    36  	tempPath := tempFile.Name()
    37  	defer os.Remove(tempPath)
    38  
    39  	// Copy and validate the GGUF file header using LimitedReader to prevent OOM
    40  	// We use LimitedReader to cap reads at maxHeaderSize (50MB)
    41  	limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
    42  	if err := copyHeader(tempFile, limitedReader); err != nil {
    43  		tempFile.Close()
    44  		return nil, nil, fmt.Errorf("failed to copy GGUF header: %w", err)
    45  	}
    46  	tempFile.Close()
    47  
    48  	// Parse using gguf-parser-go with options to skip unnecessary data
    49  	ggufFile, err := gguf_parser.ParseGGUFFile(tempPath,
    50  		gguf_parser.SkipLargeMetadata(),
    51  	)
    52  	if err != nil {
    53  		return nil, nil, fmt.Errorf("failed to parse GGUF file: %w", err)
    54  	}
    55  
    56  	// Extract metadata
    57  	metadata := ggufFile.Metadata()
    58  
    59  	// Extract version separately (will be set on Package.Version)
    60  	modelVersion := extractVersion(ggufFile.Header.MetadataKV)
    61  
    62  	// Convert to syft metadata structure
    63  	syftMetadata := &pkg.GGUFFileHeader{
    64  		Architecture:          metadata.Architecture,
    65  		Quantization:          metadata.FileTypeDescriptor,
    66  		Parameters:            uint64(metadata.Parameters),
    67  		GGUFVersion:           uint32(ggufFile.Header.Version),
    68  		TensorCount:           ggufFile.Header.TensorCount,
    69  		RemainingKeyValues:    convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
    70  		MetadataKeyValuesHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
    71  	}
    72  
    73  	// If model name is not in metadata, use filename
    74  	if metadata.Name == "" {
    75  		metadata.Name = extractModelNameFromPath(reader.Path())
    76  	}
    77  
    78  	// Create package from metadata
    79  	p := newGGUFPackage(
    80  		syftMetadata,
    81  		metadata.Name,
    82  		modelVersion,
    83  		metadata.License,
    84  		reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
    85  	)
    86  
    87  	return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse GGUF file")
    88  }
    89  
    90  // computeKVMetadataHash computes a stable hash of the KV metadata for use as a global identifier
    91  func computeKVMetadataHash(metadata gguf_parser.GGUFMetadataKVs) string {
    92  	// Sort the KV pairs by key for stable hashing
    93  	sortedKVs := make([]gguf_parser.GGUFMetadataKV, len(metadata))
    94  	copy(sortedKVs, metadata)
    95  	sort.Slice(sortedKVs, func(i, j int) bool {
    96  		return sortedKVs[i].Key < sortedKVs[j].Key
    97  	})
    98  
    99  	// Marshal sorted KVs to JSON for stable hashing
   100  	jsonBytes, err := json.Marshal(sortedKVs)
   101  	if err != nil {
   102  		log.Debugf("failed to marshal metadata for hashing: %v", err)
   103  		return ""
   104  	}
   105  
   106  	// Compute xxhash
   107  	hash := xxhash.Sum64(jsonBytes)
   108  	return fmt.Sprintf("%016x", hash) // 16 hex chars (64 bits)
   109  }
   110  
   111  // extractVersion attempts to extract version from metadata KV pairs
   112  func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string {
   113  	for _, kv := range kvs {
   114  		if kv.Key == "general.version" {
   115  			if v, ok := kv.Value.(string); ok && v != "" {
   116  				return v
   117  			}
   118  		}
   119  	}
   120  	return ""
   121  }
   122  
   123  // extractModelNameFromPath extracts the model name from the file path
   124  func extractModelNameFromPath(path string) string {
   125  	// Get the base filename
   126  	base := filepath.Base(path)
   127  
   128  	// Remove .gguf extension
   129  	name := strings.TrimSuffix(base, ".gguf")
   130  
   131  	return name
   132  }
   133  
   134  // compile-time check that parseGGUFModel is assignable to generic.Parser
   135  var _ generic.Parser = parseGGUFModel